.text



# sha256_multi_block -- entry point, CPU dispatch and stack-frame setup.
# Syntax: GNU as, AT&T operand order (source, destination).
#
# Inputs as used by the visible code:
#   %rdi  base of the hash state; biased by +128 at .Lbody, then read as
#         0-128(%rdi), 32-128(%rdi), ... 224-128(%rdi)
#   %rsi  array of 16-byte records: 64-bit pointer at +0, 32-bit count at +8
#   %edx  third argument; stored to 280(%rsp) at the top of .Loop_grande
# NOTE(review): presumably the C prototype is (ctx, inp, num) with num being a
# count of 4-lane groups -- confirm against the caller; not shown in this file
# section.
#
# Stack frame after the prologue (%rsp is 256-byte aligned):
#     0..255(%rsp)  sixteen 16-byte message-schedule slots, addressed through
#                   %rax = %rsp+128 as 0-128(%rax) .. 240-128(%rax)
#   256..271(%rsp)  four 32-bit per-lane counters, addressed through %rbx
#   272(%rsp)       value of %rsp at function entry
#   280(%rsp)       saved copy of %edx
.globl  sha256_multi_block
.type   sha256_multi_block,@function
.align  32
sha256_multi_block:
.cfi_startproc
        # One 64-bit load covers two consecutive 32-bit capability words.
        # NOTE(review): per the OPENSSL_ia32cap documentation bit 61 should be
        # the SHA-extension flag and 1<<28 the AVX flag -- confirm.
        movq    OPENSSL_ia32cap_P+4(%rip),%rcx
        btq     $61,%rcx                        # bit 29 of the upper word
        jc      _shaext_shortcut
        testl   $268435456,%ecx                 # 268435456 = 1<<28, lower word
        jnz     _avx_shortcut
        movq    %rsp,%rax                       # keep the entry %rsp
.cfi_def_cfa_register   %rax
        pushq   %rbx                            # callee-saved, used as counter base
.cfi_offset     %rbx,-16
        pushq   %rbp                            # callee-saved, used as K256 pointer
.cfi_offset     %rbp,-24
        subq    $288,%rsp                       # reserve the frame described above
        andq    $-256,%rsp                      # round %rsp down to a 256-byte boundary
        movq    %rax,272(%rsp)                  # remember the entry %rsp inside the frame
        # DW_CFA_def_cfa_expression, 6 bytes:
        #   DW_OP_breg7 (%rsp) +272, DW_OP_deref, DW_OP_plus_uconst 8
        # i.e. CFA = *(%rsp + 272) + 8.
.cfi_escape     0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08
.Lbody:
        leaq    K256+128(%rip),%rbp             # constant table, biased by 128
        leaq    256(%rsp),%rbx                  # %rbx -> four per-lane counters
        leaq    128(%rdi),%rdi                  # bias the state pointer by 128

# .Loop_grande: prepare one pass over four input lanes.
#
# For record k = 0..3 at %rsi + 16*k:
#   - the 64-bit pointer at +0 is loaded into %r8 / %r9 / %r10 / %r11,
#   - the 32-bit count at +8 is stored to 4*k(%rbx),
#   - %edx tracks the signed maximum of the counts seen so far,
#   - when the count is <= 0 the lane pointer is replaced by %rbp.
# NOTE(review): %rbp is K256+128 when arriving from .Lbody; presumably the
# substitution keeps the loads of an idle lane on readable memory -- confirm.
# When no count is positive (%edx still 0) control goes to .Ldone.
.Loop_grande:
        movl    %edx,280(%rsp)                  # save %edx before reusing it
        xorl    %edx,%edx                       # running maximum = 0

        movq    0(%rsi),%r8                     # lane 0 pointer

        movl    8(%rsi),%ecx                    # lane 0 count
        cmpl    %edx,%ecx
        cmovgl  %ecx,%edx                       # max = count when count > max (signed)
        testl   %ecx,%ecx
        movl    %ecx,0(%rbx)                    # mov leaves the flags from testl intact
        cmovleq %rbp,%r8                        # count <= 0: redirect lane to %rbp

        movq    16(%rsi),%r9                    # lane 1 pointer

        movl    24(%rsi),%ecx                   # lane 1 count
        cmpl    %edx,%ecx
        cmovgl  %ecx,%edx
        testl   %ecx,%ecx
        movl    %ecx,4(%rbx)
        cmovleq %rbp,%r9

        movq    32(%rsi),%r10                   # lane 2 pointer

        movl    40(%rsi),%ecx                   # lane 2 count
        cmpl    %edx,%ecx
        cmovgl  %ecx,%edx
        testl   %ecx,%ecx
        movl    %ecx,8(%rbx)
        cmovleq %rbp,%r10

        movq    48(%rsi),%r11                   # lane 3 pointer

        movl    56(%rsi),%ecx                   # lane 3 count
        cmpl    %edx,%ecx
        cmovgl  %ecx,%edx
        testl   %ecx,%ecx
        movl    %ecx,12(%rbx)
        cmovleq %rbp,%r11
        testl   %edx,%edx                       # zero means no lane had a positive count
        jz      .Ldone

        # Load eight 16-byte state rows (row stride is 32 bytes; only the
        # first 16 bytes of each row are read here) into %xmm8..%xmm15.
        # The round code in .Loop uses them as working variables a..h.
        movdqu  0-128(%rdi),%xmm8
        leaq    128(%rsp),%rax                  # %rax -> middle of the schedule slots
        movdqu  32-128(%rdi),%xmm9
        movdqu  64-128(%rdi),%xmm10
        movdqu  96-128(%rdi),%xmm11
        movdqu  128-128(%rdi),%xmm12
        movdqu  160-128(%rdi),%xmm13
        movdqu  192-128(%rdi),%xmm14
        movdqu  224-128(%rdi),%xmm15
        # Shuffle mask consumed by the pshufb in .Loop.
        # NOTE(review): .Lpbswap is defined outside this view; presumably a
        # per-dword byte-swap mask -- confirm.
        movdqu  .Lpbswap(%rip),%xmm6
        jmp     .Loop

# .Loop: rounds 0..15 for one 64-byte block from each of the four lanes,
# fully unrolled.  Every 32-bit element of an xmm register belongs to one lane.
#
# Round i does the following:
#   1. gathers the dword at offset 4*i from %r8, %r9, %r10, %r11 into %xmm5
#      (elements 0..3 = lanes 0..3) with movd + punpckldq,
#   2. shuffles it with the mask in %xmm6; the pshufb is emitted as raw bytes
#      (.byte 102,15,56,0,238 = 66 0F 38 00 EE = pshufb %xmm6,%xmm5),
#   3. stores the word W[i] in schedule slot i at 16*i-128(%rax),
#   4. T1 = h + W[i] + K[i] + Sigma1(e) + Ch(e,f,g), with K[i] read from
#      32*(i mod 8)-128(%rbp); %rbp advances by 256 after rounds 7 and 15,
#   5. d += T1, and the register that held h receives
#      T1 + Sigma0(a) + Maj(a,b,c).
# Rotates are built from shift pairs XORed together:
#   Sigma1(e) = e>>6 ^ e<<26 ^ e>>11 ^ e<<21 ^ e>>25 ^ e<<7
#   Sigma0(a) = a>>2 ^ a<<30 ^ a>>13 ^ a<<19 ^ a>>22 ^ a<<10
# Maj(a,b,c) is computed as b ^ ((a^b) & (b^c)); the a^b value of one round is
# reused as b^c of the next, alternating between %xmm3 and %xmm4.
# The roles a..h rotate through %xmm8..%xmm15 by one register per round, so no
# data is moved between rounds.  Scratch: %xmm0, %xmm1, %xmm2, %xmm5, %xmm7.
.align  32
.Loop:
        # Round 0: a=%xmm8 b=%xmm9 c=%xmm10 d=%xmm11 e=%xmm12 f=%xmm13 g=%xmm14 h=%xmm15
        movdqa  %xmm10,%xmm4
        pxor    %xmm9,%xmm4                     # %xmm4 = b ^ c, seed for Maj
        movd    0(%r8),%xmm5                    # dword 0 of lane 0
        movd    0(%r9),%xmm0                    # dword 0 of lane 1
        movd    0(%r10),%xmm1                   # dword 0 of lane 2
        movd    0(%r11),%xmm2                   # dword 0 of lane 3
        punpckldq       %xmm1,%xmm5
        punpckldq       %xmm2,%xmm0
        punpckldq       %xmm0,%xmm5             # %xmm5 = {lane0, lane1, lane2, lane3}
        movdqa  %xmm12,%xmm7
.byte   102,15,56,0,238                         # pshufb %xmm6,%xmm5
        movdqa  %xmm12,%xmm2

        psrld   $6,%xmm7                        # e >> 6
        movdqa  %xmm12,%xmm1
        pslld   $7,%xmm2                        # e << 7
        movdqa  %xmm5,0-128(%rax)               # schedule slot 0 = W[0]
        paddd   %xmm15,%xmm5                    # T1 = W[0] + h

        psrld   $11,%xmm1                       # e >> 11
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2                     # e << 21
        paddd   -128(%rbp),%xmm5                # T1 += K[0]
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1                    # e >> 25
        movdqa  %xmm12,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm12,%xmm3
        pslld   $26-21,%xmm2                    # e << 26
        pandn   %xmm14,%xmm0                    # ~e & g
        pand    %xmm13,%xmm3                    # e & f
        pxor    %xmm1,%xmm7


        movdqa  %xmm8,%xmm1
        pxor    %xmm2,%xmm7                     # %xmm7 = Sigma1(e)
        movdqa  %xmm8,%xmm2
        psrld   $2,%xmm1                        # a >> 2
        paddd   %xmm7,%xmm5                     # T1 += Sigma1(e)
        pxor    %xmm3,%xmm0                     # %xmm0 = Ch(e,f,g)
        movdqa  %xmm9,%xmm3
        movdqa  %xmm8,%xmm7
        pslld   $10,%xmm2                       # a << 10
        pxor    %xmm8,%xmm3                     # %xmm3 = a ^ b, reused as b ^ c next round


        psrld   $13,%xmm7                       # a >> 13
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5                     # T1 += Ch(e,f,g)
        pslld   $19-10,%xmm2                    # a << 19
        pand    %xmm3,%xmm4                     # (a ^ b) & (b ^ c)
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7                    # a >> 22
        pxor    %xmm2,%xmm1
        movdqa  %xmm9,%xmm15
        pslld   $30-19,%xmm2                    # a << 30
        pxor    %xmm1,%xmm7
        pxor    %xmm4,%xmm15                    # %xmm15 = Maj(a,b,c)
        paddd   %xmm5,%xmm11                    # d += T1
        pxor    %xmm2,%xmm7                     # %xmm7 = Sigma0(a)

        paddd   %xmm5,%xmm15
        paddd   %xmm7,%xmm15                    # old h register = T1 + Maj + Sigma0
        # Round 1: a=%xmm15 d=%xmm10 e=%xmm11 h=%xmm14; W[1] -> slot 1; K at -96(%rbp)
        movd    4(%r8),%xmm5
        movd    4(%r9),%xmm0
        movd    4(%r10),%xmm1
        movd    4(%r11),%xmm2
        punpckldq       %xmm1,%xmm5
        punpckldq       %xmm2,%xmm0
        punpckldq       %xmm0,%xmm5
        movdqa  %xmm11,%xmm7

        movdqa  %xmm11,%xmm2
.byte   102,15,56,0,238
        psrld   $6,%xmm7
        movdqa  %xmm11,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,16-128(%rax)
        paddd   %xmm14,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   -96(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm11,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm11,%xmm4
        pslld   $26-21,%xmm2
        pandn   %xmm13,%xmm0
        pand    %xmm12,%xmm4
        pxor    %xmm1,%xmm7


        movdqa  %xmm15,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm15,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm4,%xmm0
        movdqa  %xmm8,%xmm4
        movdqa  %xmm15,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm15,%xmm4


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm4,%xmm3
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm8,%xmm14
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm3,%xmm14
        paddd   %xmm5,%xmm10
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm14
        paddd   %xmm7,%xmm14
        # Round 2: a=%xmm14 d=%xmm9 e=%xmm10 h=%xmm13; W[2] -> slot 2; K at -64(%rbp)
        movd    8(%r8),%xmm5
        movd    8(%r9),%xmm0
        movd    8(%r10),%xmm1
        movd    8(%r11),%xmm2
        punpckldq       %xmm1,%xmm5
        punpckldq       %xmm2,%xmm0
        punpckldq       %xmm0,%xmm5
        movdqa  %xmm10,%xmm7
.byte   102,15,56,0,238
        movdqa  %xmm10,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm10,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,32-128(%rax)
        paddd   %xmm13,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   -64(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm10,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm10,%xmm3
        pslld   $26-21,%xmm2
        pandn   %xmm12,%xmm0
        pand    %xmm11,%xmm3
        pxor    %xmm1,%xmm7


        movdqa  %xmm14,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm14,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        movdqa  %xmm15,%xmm3
        movdqa  %xmm14,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm14,%xmm3


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm3,%xmm4
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm15,%xmm13
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm4,%xmm13
        paddd   %xmm5,%xmm9
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm13
        paddd   %xmm7,%xmm13
        # Round 3: a=%xmm13 d=%xmm8 e=%xmm9 h=%xmm12; W[3] -> slot 3; K at -32(%rbp)
        movd    12(%r8),%xmm5
        movd    12(%r9),%xmm0
        movd    12(%r10),%xmm1
        movd    12(%r11),%xmm2
        punpckldq       %xmm1,%xmm5
        punpckldq       %xmm2,%xmm0
        punpckldq       %xmm0,%xmm5
        movdqa  %xmm9,%xmm7

        movdqa  %xmm9,%xmm2
.byte   102,15,56,0,238
        psrld   $6,%xmm7
        movdqa  %xmm9,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,48-128(%rax)
        paddd   %xmm12,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   -32(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm9,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm9,%xmm4
        pslld   $26-21,%xmm2
        pandn   %xmm11,%xmm0
        pand    %xmm10,%xmm4
        pxor    %xmm1,%xmm7


        movdqa  %xmm13,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm13,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm4,%xmm0
        movdqa  %xmm14,%xmm4
        movdqa  %xmm13,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm13,%xmm4


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm4,%xmm3
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm14,%xmm12
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm3,%xmm12
        paddd   %xmm5,%xmm8
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm12
        paddd   %xmm7,%xmm12
        # Round 4: a=%xmm12 d=%xmm15 e=%xmm8 h=%xmm11; W[4] -> slot 4; K at 0(%rbp)
        movd    16(%r8),%xmm5
        movd    16(%r9),%xmm0
        movd    16(%r10),%xmm1
        movd    16(%r11),%xmm2
        punpckldq       %xmm1,%xmm5
        punpckldq       %xmm2,%xmm0
        punpckldq       %xmm0,%xmm5
        movdqa  %xmm8,%xmm7
.byte   102,15,56,0,238
        movdqa  %xmm8,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm8,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,64-128(%rax)
        paddd   %xmm11,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   0(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm8,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm8,%xmm3
        pslld   $26-21,%xmm2
        pandn   %xmm10,%xmm0
        pand    %xmm9,%xmm3
        pxor    %xmm1,%xmm7


        movdqa  %xmm12,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm12,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        movdqa  %xmm13,%xmm3
        movdqa  %xmm12,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm12,%xmm3


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm3,%xmm4
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm13,%xmm11
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm4,%xmm11
        paddd   %xmm5,%xmm15
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm11
        paddd   %xmm7,%xmm11
        # Round 5: a=%xmm11 d=%xmm14 e=%xmm15 h=%xmm10; W[5] -> slot 5; K at 32(%rbp)
        movd    20(%r8),%xmm5
        movd    20(%r9),%xmm0
        movd    20(%r10),%xmm1
        movd    20(%r11),%xmm2
        punpckldq       %xmm1,%xmm5
        punpckldq       %xmm2,%xmm0
        punpckldq       %xmm0,%xmm5
        movdqa  %xmm15,%xmm7

        movdqa  %xmm15,%xmm2
.byte   102,15,56,0,238
        psrld   $6,%xmm7
        movdqa  %xmm15,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,80-128(%rax)
        paddd   %xmm10,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   32(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm15,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm15,%xmm4
        pslld   $26-21,%xmm2
        pandn   %xmm9,%xmm0
        pand    %xmm8,%xmm4
        pxor    %xmm1,%xmm7


        movdqa  %xmm11,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm11,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm4,%xmm0
        movdqa  %xmm12,%xmm4
        movdqa  %xmm11,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm11,%xmm4


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm4,%xmm3
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm12,%xmm10
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm3,%xmm10
        paddd   %xmm5,%xmm14
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm10
        paddd   %xmm7,%xmm10
        # Round 6: a=%xmm10 d=%xmm13 e=%xmm14 h=%xmm9; W[6] -> slot 6; K at 64(%rbp)
        movd    24(%r8),%xmm5
        movd    24(%r9),%xmm0
        movd    24(%r10),%xmm1
        movd    24(%r11),%xmm2
        punpckldq       %xmm1,%xmm5
        punpckldq       %xmm2,%xmm0
        punpckldq       %xmm0,%xmm5
        movdqa  %xmm14,%xmm7
.byte   102,15,56,0,238
        movdqa  %xmm14,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm14,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,96-128(%rax)
        paddd   %xmm9,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   64(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm14,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm14,%xmm3
        pslld   $26-21,%xmm2
        pandn   %xmm8,%xmm0
        pand    %xmm15,%xmm3
        pxor    %xmm1,%xmm7


        movdqa  %xmm10,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm10,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        movdqa  %xmm11,%xmm3
        movdqa  %xmm10,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm10,%xmm3


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm3,%xmm4
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm11,%xmm9
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm4,%xmm9
        paddd   %xmm5,%xmm13
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm9
        paddd   %xmm7,%xmm9
        # Round 7: a=%xmm9 d=%xmm12 e=%xmm13 h=%xmm8; W[7] -> slot 7; K at 96(%rbp)
        movd    28(%r8),%xmm5
        movd    28(%r9),%xmm0
        movd    28(%r10),%xmm1
        movd    28(%r11),%xmm2
        punpckldq       %xmm1,%xmm5
        punpckldq       %xmm2,%xmm0
        punpckldq       %xmm0,%xmm5
        movdqa  %xmm13,%xmm7

        movdqa  %xmm13,%xmm2
.byte   102,15,56,0,238
        psrld   $6,%xmm7
        movdqa  %xmm13,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,112-128(%rax)
        paddd   %xmm8,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   96(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm13,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm13,%xmm4
        pslld   $26-21,%xmm2
        pandn   %xmm15,%xmm0
        pand    %xmm14,%xmm4
        pxor    %xmm1,%xmm7


        movdqa  %xmm9,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm9,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm4,%xmm0
        movdqa  %xmm10,%xmm4
        movdqa  %xmm9,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm9,%xmm4


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm4,%xmm3
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm10,%xmm8
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm3,%xmm8
        paddd   %xmm5,%xmm12
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm8
        paddd   %xmm7,%xmm8
        # Advance the constant pointer by eight 32-byte entries, then
        # Round 8: a=%xmm8 d=%xmm11 e=%xmm12 h=%xmm15; W[8] -> slot 8; K at -128(%rbp)
        leaq    256(%rbp),%rbp
        movd    32(%r8),%xmm5
        movd    32(%r9),%xmm0
        movd    32(%r10),%xmm1
        movd    32(%r11),%xmm2
        punpckldq       %xmm1,%xmm5
        punpckldq       %xmm2,%xmm0
        punpckldq       %xmm0,%xmm5
        movdqa  %xmm12,%xmm7
.byte   102,15,56,0,238
        movdqa  %xmm12,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm12,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,128-128(%rax)
        paddd   %xmm15,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   -128(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm12,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm12,%xmm3
        pslld   $26-21,%xmm2
        pandn   %xmm14,%xmm0
        pand    %xmm13,%xmm3
        pxor    %xmm1,%xmm7


        movdqa  %xmm8,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm8,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        movdqa  %xmm9,%xmm3
        movdqa  %xmm8,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm8,%xmm3


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm3,%xmm4
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm9,%xmm15
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm4,%xmm15
        paddd   %xmm5,%xmm11
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm15
        paddd   %xmm7,%xmm15
        # Round 9: a=%xmm15 d=%xmm10 e=%xmm11 h=%xmm14; W[9] -> slot 9; K at -96(%rbp)
        movd    36(%r8),%xmm5
        movd    36(%r9),%xmm0
        movd    36(%r10),%xmm1
        movd    36(%r11),%xmm2
        punpckldq       %xmm1,%xmm5
        punpckldq       %xmm2,%xmm0
        punpckldq       %xmm0,%xmm5
        movdqa  %xmm11,%xmm7

        movdqa  %xmm11,%xmm2
.byte   102,15,56,0,238
        psrld   $6,%xmm7
        movdqa  %xmm11,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,144-128(%rax)
        paddd   %xmm14,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   -96(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm11,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm11,%xmm4
        pslld   $26-21,%xmm2
        pandn   %xmm13,%xmm0
        pand    %xmm12,%xmm4
        pxor    %xmm1,%xmm7


        movdqa  %xmm15,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm15,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm4,%xmm0
        movdqa  %xmm8,%xmm4
        movdqa  %xmm15,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm15,%xmm4


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm4,%xmm3
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm8,%xmm14
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm3,%xmm14
        paddd   %xmm5,%xmm10
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm14
        paddd   %xmm7,%xmm14
        # Round 10: a=%xmm14 d=%xmm9 e=%xmm10 h=%xmm13; W[10] -> slot 10; K at -64(%rbp)
        movd    40(%r8),%xmm5
        movd    40(%r9),%xmm0
        movd    40(%r10),%xmm1
        movd    40(%r11),%xmm2
        punpckldq       %xmm1,%xmm5
        punpckldq       %xmm2,%xmm0
        punpckldq       %xmm0,%xmm5
        movdqa  %xmm10,%xmm7
.byte   102,15,56,0,238
        movdqa  %xmm10,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm10,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,160-128(%rax)
        paddd   %xmm13,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   -64(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm10,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm10,%xmm3
        pslld   $26-21,%xmm2
        pandn   %xmm12,%xmm0
        pand    %xmm11,%xmm3
        pxor    %xmm1,%xmm7


        movdqa  %xmm14,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm14,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        movdqa  %xmm15,%xmm3
        movdqa  %xmm14,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm14,%xmm3


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm3,%xmm4
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm15,%xmm13
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm4,%xmm13
        paddd   %xmm5,%xmm9
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm13
        paddd   %xmm7,%xmm13
        # Round 11: a=%xmm13 d=%xmm8 e=%xmm9 h=%xmm12; W[11] -> slot 11; K at -32(%rbp)
        movd    44(%r8),%xmm5
        movd    44(%r9),%xmm0
        movd    44(%r10),%xmm1
        movd    44(%r11),%xmm2
        punpckldq       %xmm1,%xmm5
        punpckldq       %xmm2,%xmm0
        punpckldq       %xmm0,%xmm5
        movdqa  %xmm9,%xmm7

        movdqa  %xmm9,%xmm2
.byte   102,15,56,0,238
        psrld   $6,%xmm7
        movdqa  %xmm9,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,176-128(%rax)
        paddd   %xmm12,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   -32(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm9,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm9,%xmm4
        pslld   $26-21,%xmm2
        pandn   %xmm11,%xmm0
        pand    %xmm10,%xmm4
        pxor    %xmm1,%xmm7


        movdqa  %xmm13,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm13,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm4,%xmm0
        movdqa  %xmm14,%xmm4
        movdqa  %xmm13,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm13,%xmm4


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm4,%xmm3
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm14,%xmm12
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm3,%xmm12
        paddd   %xmm5,%xmm8
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm12
        paddd   %xmm7,%xmm12
        # Round 12: a=%xmm12 d=%xmm15 e=%xmm8 h=%xmm11; W[12] -> slot 12; K at 0(%rbp)
        movd    48(%r8),%xmm5
        movd    48(%r9),%xmm0
        movd    48(%r10),%xmm1
        movd    48(%r11),%xmm2
        punpckldq       %xmm1,%xmm5
        punpckldq       %xmm2,%xmm0
        punpckldq       %xmm0,%xmm5
        movdqa  %xmm8,%xmm7
.byte   102,15,56,0,238
        movdqa  %xmm8,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm8,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,192-128(%rax)
        paddd   %xmm11,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   0(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm8,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm8,%xmm3
        pslld   $26-21,%xmm2
        pandn   %xmm10,%xmm0
        pand    %xmm9,%xmm3
        pxor    %xmm1,%xmm7


        movdqa  %xmm12,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm12,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        movdqa  %xmm13,%xmm3
        movdqa  %xmm12,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm12,%xmm3


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm3,%xmm4
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm13,%xmm11
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm4,%xmm11
        paddd   %xmm5,%xmm15
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm11
        paddd   %xmm7,%xmm11
        # Round 13: a=%xmm11 d=%xmm14 e=%xmm15 h=%xmm10; W[13] -> slot 13; K at 32(%rbp)
        movd    52(%r8),%xmm5
        movd    52(%r9),%xmm0
        movd    52(%r10),%xmm1
        movd    52(%r11),%xmm2
        punpckldq       %xmm1,%xmm5
        punpckldq       %xmm2,%xmm0
        punpckldq       %xmm0,%xmm5
        movdqa  %xmm15,%xmm7

        movdqa  %xmm15,%xmm2
.byte   102,15,56,0,238
        psrld   $6,%xmm7
        movdqa  %xmm15,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,208-128(%rax)
        paddd   %xmm10,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   32(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm15,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm15,%xmm4
        pslld   $26-21,%xmm2
        pandn   %xmm9,%xmm0
        pand    %xmm8,%xmm4
        pxor    %xmm1,%xmm7


        movdqa  %xmm11,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm11,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm4,%xmm0
        movdqa  %xmm12,%xmm4
        movdqa  %xmm11,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm11,%xmm4


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm4,%xmm3
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm12,%xmm10
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm3,%xmm10
        paddd   %xmm5,%xmm14
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm10
        paddd   %xmm7,%xmm10
        # Round 14: a=%xmm10 d=%xmm13 e=%xmm14 h=%xmm9; W[14] -> slot 14; K at 64(%rbp)
        movd    56(%r8),%xmm5
        movd    56(%r9),%xmm0
        movd    56(%r10),%xmm1
        movd    56(%r11),%xmm2
        punpckldq       %xmm1,%xmm5
        punpckldq       %xmm2,%xmm0
        punpckldq       %xmm0,%xmm5
        movdqa  %xmm14,%xmm7
.byte   102,15,56,0,238
        movdqa  %xmm14,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm14,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,224-128(%rax)
        paddd   %xmm9,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   64(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm14,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm14,%xmm3
        pslld   $26-21,%xmm2
        pandn   %xmm8,%xmm0
        pand    %xmm15,%xmm3
        pxor    %xmm1,%xmm7


        movdqa  %xmm10,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm10,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        movdqa  %xmm11,%xmm3
        movdqa  %xmm10,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm10,%xmm3


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm3,%xmm4
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm11,%xmm9
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm4,%xmm9
        paddd   %xmm5,%xmm13
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm9
        paddd   %xmm7,%xmm9
        # Round 15: a=%xmm9 d=%xmm12 e=%xmm13 h=%xmm8; W[15] -> slot 15; K at 96(%rbp)
        # Each lane pointer is advanced by 64 right after its last dword is
        # read, so the prefetches of 63(%r8..%r11) below touch the final byte
        # of the following 64-byte block of each lane.
        movd    60(%r8),%xmm5
        leaq    64(%r8),%r8
        movd    60(%r9),%xmm0
        leaq    64(%r9),%r9
        movd    60(%r10),%xmm1
        leaq    64(%r10),%r10
        movd    60(%r11),%xmm2
        leaq    64(%r11),%r11
        punpckldq       %xmm1,%xmm5
        punpckldq       %xmm2,%xmm0
        punpckldq       %xmm0,%xmm5
        movdqa  %xmm13,%xmm7

        movdqa  %xmm13,%xmm2
.byte   102,15,56,0,238
        psrld   $6,%xmm7
        movdqa  %xmm13,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,240-128(%rax)
        paddd   %xmm8,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   96(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm13,%xmm0
        prefetcht0      63(%r8)
        pxor    %xmm2,%xmm7
        movdqa  %xmm13,%xmm4
        pslld   $26-21,%xmm2
        pandn   %xmm15,%xmm0
        pand    %xmm14,%xmm4
        pxor    %xmm1,%xmm7

        prefetcht0      63(%r9)
        movdqa  %xmm9,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm9,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm4,%xmm0
        movdqa  %xmm10,%xmm4
        movdqa  %xmm9,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm9,%xmm4

        prefetcht0      63(%r10)
        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm4,%xmm3
        pxor    %xmm7,%xmm1

        prefetcht0      63(%r11)
        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm10,%xmm8
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm3,%xmm8
        paddd   %xmm5,%xmm12
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm8
        paddd   %xmm7,%xmm8
        leaq    256(%rbp),%rbp                  # advance constant pointer past rounds 8..15
        movdqu  0-128(%rax),%xmm5               # preload schedule slot 0 for .Loop_16_xx
        # NOTE(review): %ecx = 3 is presumably the number of 16-round passes
        # left (3 * 16 = 48); the decrement is outside this view -- confirm.
        movl    $3,%ecx
        jmp     .Loop_16_xx
.align  32
.Loop_16_xx:
        movdqa  16-128(%rax),%xmm6
        paddd   144-128(%rax),%xmm5

        movdqa  %xmm6,%xmm7
        movdqa  %xmm6,%xmm1
        psrld   $3,%xmm7
        movdqa  %xmm6,%xmm2

        psrld   $7,%xmm1
        movdqa  224-128(%rax),%xmm0
        pslld   $14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $18-7,%xmm1
        movdqa  %xmm0,%xmm3
        pxor    %xmm2,%xmm7
        pslld   $25-14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $10,%xmm0
        movdqa  %xmm3,%xmm1

        psrld   $17,%xmm3
        pxor    %xmm2,%xmm7
        pslld   $13,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        psrld   $19-17,%xmm3
        pxor    %xmm1,%xmm0
        pslld   $15-13,%xmm1
        pxor    %xmm3,%xmm0
        pxor    %xmm1,%xmm0
        paddd   %xmm0,%xmm5
        movdqa  %xmm12,%xmm7

        movdqa  %xmm12,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm12,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,0-128(%rax)
        paddd   %xmm15,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   -128(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm12,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm12,%xmm3
        pslld   $26-21,%xmm2
        pandn   %xmm14,%xmm0
        pand    %xmm13,%xmm3
        pxor    %xmm1,%xmm7


        movdqa  %xmm8,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm8,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        movdqa  %xmm9,%xmm3
        movdqa  %xmm8,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm8,%xmm3


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm3,%xmm4
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm9,%xmm15
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm4,%xmm15
        paddd   %xmm5,%xmm11
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm15
        paddd   %xmm7,%xmm15
        movdqa  32-128(%rax),%xmm5
        paddd   160-128(%rax),%xmm6

        movdqa  %xmm5,%xmm7
        movdqa  %xmm5,%xmm1
        psrld   $3,%xmm7
        movdqa  %xmm5,%xmm2

        psrld   $7,%xmm1
        movdqa  240-128(%rax),%xmm0
        pslld   $14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $18-7,%xmm1
        movdqa  %xmm0,%xmm4
        pxor    %xmm2,%xmm7
        pslld   $25-14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $10,%xmm0
        movdqa  %xmm4,%xmm1

        psrld   $17,%xmm4
        pxor    %xmm2,%xmm7
        pslld   $13,%xmm1
        paddd   %xmm7,%xmm6
        pxor    %xmm4,%xmm0
        psrld   $19-17,%xmm4
        pxor    %xmm1,%xmm0
        pslld   $15-13,%xmm1
        pxor    %xmm4,%xmm0
        pxor    %xmm1,%xmm0
        paddd   %xmm0,%xmm6
        movdqa  %xmm11,%xmm7

        movdqa  %xmm11,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm11,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm6,16-128(%rax)
        paddd   %xmm14,%xmm6

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   -96(%rbp),%xmm6
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm11,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm11,%xmm4
        pslld   $26-21,%xmm2
        pandn   %xmm13,%xmm0
        pand    %xmm12,%xmm4
        pxor    %xmm1,%xmm7


        movdqa  %xmm15,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm15,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm6
        pxor    %xmm4,%xmm0
        movdqa  %xmm8,%xmm4
        movdqa  %xmm15,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm15,%xmm4


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm6
        pslld   $19-10,%xmm2
        pand    %xmm4,%xmm3
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm8,%xmm14
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm3,%xmm14
        paddd   %xmm6,%xmm10
        pxor    %xmm2,%xmm7

        paddd   %xmm6,%xmm14
        paddd   %xmm7,%xmm14
        movdqa  48-128(%rax),%xmm6
        paddd   176-128(%rax),%xmm5

        movdqa  %xmm6,%xmm7
        movdqa  %xmm6,%xmm1
        psrld   $3,%xmm7
        movdqa  %xmm6,%xmm2

        psrld   $7,%xmm1
        movdqa  0-128(%rax),%xmm0
        pslld   $14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $18-7,%xmm1
        movdqa  %xmm0,%xmm3
        pxor    %xmm2,%xmm7
        pslld   $25-14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $10,%xmm0
        movdqa  %xmm3,%xmm1

        psrld   $17,%xmm3
        pxor    %xmm2,%xmm7
        pslld   $13,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        psrld   $19-17,%xmm3
        pxor    %xmm1,%xmm0
        pslld   $15-13,%xmm1
        pxor    %xmm3,%xmm0
        pxor    %xmm1,%xmm0
        paddd   %xmm0,%xmm5
        movdqa  %xmm10,%xmm7

        movdqa  %xmm10,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm10,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,32-128(%rax)
        paddd   %xmm13,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   -64(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm10,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm10,%xmm3
        pslld   $26-21,%xmm2
        pandn   %xmm12,%xmm0
        pand    %xmm11,%xmm3
        pxor    %xmm1,%xmm7


        movdqa  %xmm14,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm14,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        movdqa  %xmm15,%xmm3
        movdqa  %xmm14,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm14,%xmm3


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm3,%xmm4
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm15,%xmm13
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm4,%xmm13
        paddd   %xmm5,%xmm9
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm13
        paddd   %xmm7,%xmm13
        movdqa  64-128(%rax),%xmm5
        paddd   192-128(%rax),%xmm6

        movdqa  %xmm5,%xmm7
        movdqa  %xmm5,%xmm1
        psrld   $3,%xmm7
        movdqa  %xmm5,%xmm2

        psrld   $7,%xmm1
        movdqa  16-128(%rax),%xmm0
        pslld   $14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $18-7,%xmm1
        movdqa  %xmm0,%xmm4
        pxor    %xmm2,%xmm7
        pslld   $25-14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $10,%xmm0
        movdqa  %xmm4,%xmm1

        psrld   $17,%xmm4
        pxor    %xmm2,%xmm7
        pslld   $13,%xmm1
        paddd   %xmm7,%xmm6
        pxor    %xmm4,%xmm0
        psrld   $19-17,%xmm4
        pxor    %xmm1,%xmm0
        pslld   $15-13,%xmm1
        pxor    %xmm4,%xmm0
        pxor    %xmm1,%xmm0
        paddd   %xmm0,%xmm6
        movdqa  %xmm9,%xmm7

        movdqa  %xmm9,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm9,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm6,48-128(%rax)
        paddd   %xmm12,%xmm6

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   -32(%rbp),%xmm6
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm9,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm9,%xmm4
        pslld   $26-21,%xmm2
        pandn   %xmm11,%xmm0
        pand    %xmm10,%xmm4
        pxor    %xmm1,%xmm7


        movdqa  %xmm13,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm13,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm6
        pxor    %xmm4,%xmm0
        movdqa  %xmm14,%xmm4
        movdqa  %xmm13,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm13,%xmm4


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm6
        pslld   $19-10,%xmm2
        pand    %xmm4,%xmm3
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm14,%xmm12
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm3,%xmm12
        paddd   %xmm6,%xmm8
        pxor    %xmm2,%xmm7

        paddd   %xmm6,%xmm12
        paddd   %xmm7,%xmm12
        movdqa  80-128(%rax),%xmm6
        paddd   208-128(%rax),%xmm5

        movdqa  %xmm6,%xmm7
        movdqa  %xmm6,%xmm1
        psrld   $3,%xmm7
        movdqa  %xmm6,%xmm2

        psrld   $7,%xmm1
        movdqa  32-128(%rax),%xmm0
        pslld   $14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $18-7,%xmm1
        movdqa  %xmm0,%xmm3
        pxor    %xmm2,%xmm7
        pslld   $25-14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $10,%xmm0
        movdqa  %xmm3,%xmm1

        psrld   $17,%xmm3
        pxor    %xmm2,%xmm7
        pslld   $13,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        psrld   $19-17,%xmm3
        pxor    %xmm1,%xmm0
        pslld   $15-13,%xmm1
        pxor    %xmm3,%xmm0
        pxor    %xmm1,%xmm0
        paddd   %xmm0,%xmm5
        movdqa  %xmm8,%xmm7

        movdqa  %xmm8,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm8,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,64-128(%rax)
        paddd   %xmm11,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   0(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm8,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm8,%xmm3
        pslld   $26-21,%xmm2
        pandn   %xmm10,%xmm0
        pand    %xmm9,%xmm3
        pxor    %xmm1,%xmm7


        movdqa  %xmm12,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm12,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        movdqa  %xmm13,%xmm3
        movdqa  %xmm12,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm12,%xmm3


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm3,%xmm4
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm13,%xmm11
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm4,%xmm11
        paddd   %xmm5,%xmm15
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm11
        paddd   %xmm7,%xmm11
        movdqa  96-128(%rax),%xmm5
        paddd   224-128(%rax),%xmm6

        movdqa  %xmm5,%xmm7
        movdqa  %xmm5,%xmm1
        psrld   $3,%xmm7
        movdqa  %xmm5,%xmm2

        psrld   $7,%xmm1
        movdqa  48-128(%rax),%xmm0
        pslld   $14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $18-7,%xmm1
        movdqa  %xmm0,%xmm4
        pxor    %xmm2,%xmm7
        pslld   $25-14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $10,%xmm0
        movdqa  %xmm4,%xmm1

        psrld   $17,%xmm4
        pxor    %xmm2,%xmm7
        pslld   $13,%xmm1
        paddd   %xmm7,%xmm6
        pxor    %xmm4,%xmm0
        psrld   $19-17,%xmm4
        pxor    %xmm1,%xmm0
        pslld   $15-13,%xmm1
        pxor    %xmm4,%xmm0
        pxor    %xmm1,%xmm0
        paddd   %xmm0,%xmm6
        movdqa  %xmm15,%xmm7

        movdqa  %xmm15,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm15,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm6,80-128(%rax)
        paddd   %xmm10,%xmm6

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   32(%rbp),%xmm6
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm15,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm15,%xmm4
        pslld   $26-21,%xmm2
        pandn   %xmm9,%xmm0
        pand    %xmm8,%xmm4
        pxor    %xmm1,%xmm7


        movdqa  %xmm11,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm11,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm6
        pxor    %xmm4,%xmm0
        movdqa  %xmm12,%xmm4
        movdqa  %xmm11,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm11,%xmm4


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm6
        pslld   $19-10,%xmm2
        pand    %xmm4,%xmm3
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm12,%xmm10
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm3,%xmm10
        paddd   %xmm6,%xmm14
        pxor    %xmm2,%xmm7

        paddd   %xmm6,%xmm10
        paddd   %xmm7,%xmm10
        movdqa  112-128(%rax),%xmm6
        paddd   240-128(%rax),%xmm5

        movdqa  %xmm6,%xmm7
        movdqa  %xmm6,%xmm1
        psrld   $3,%xmm7
        movdqa  %xmm6,%xmm2

        psrld   $7,%xmm1
        movdqa  64-128(%rax),%xmm0
        pslld   $14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $18-7,%xmm1
        movdqa  %xmm0,%xmm3
        pxor    %xmm2,%xmm7
        pslld   $25-14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $10,%xmm0
        movdqa  %xmm3,%xmm1

        psrld   $17,%xmm3
        pxor    %xmm2,%xmm7
        pslld   $13,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        psrld   $19-17,%xmm3
        pxor    %xmm1,%xmm0
        pslld   $15-13,%xmm1
        pxor    %xmm3,%xmm0
        pxor    %xmm1,%xmm0
        paddd   %xmm0,%xmm5
        movdqa  %xmm14,%xmm7

        movdqa  %xmm14,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm14,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,96-128(%rax)
        paddd   %xmm9,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   64(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm14,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm14,%xmm3
        pslld   $26-21,%xmm2
        pandn   %xmm8,%xmm0
        pand    %xmm15,%xmm3
        pxor    %xmm1,%xmm7


        movdqa  %xmm10,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm10,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        movdqa  %xmm11,%xmm3
        movdqa  %xmm10,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm10,%xmm3


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm3,%xmm4
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm11,%xmm9
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm4,%xmm9
        paddd   %xmm5,%xmm13
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm9
        paddd   %xmm7,%xmm9
        movdqa  128-128(%rax),%xmm5
        paddd   0-128(%rax),%xmm6

        movdqa  %xmm5,%xmm7
        movdqa  %xmm5,%xmm1
        psrld   $3,%xmm7
        movdqa  %xmm5,%xmm2

        psrld   $7,%xmm1
        movdqa  80-128(%rax),%xmm0
        pslld   $14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $18-7,%xmm1
        movdqa  %xmm0,%xmm4
        pxor    %xmm2,%xmm7
        pslld   $25-14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $10,%xmm0
        movdqa  %xmm4,%xmm1

        psrld   $17,%xmm4
        pxor    %xmm2,%xmm7
        pslld   $13,%xmm1
        paddd   %xmm7,%xmm6
        pxor    %xmm4,%xmm0
        psrld   $19-17,%xmm4
        pxor    %xmm1,%xmm0
        pslld   $15-13,%xmm1
        pxor    %xmm4,%xmm0
        pxor    %xmm1,%xmm0
        paddd   %xmm0,%xmm6
        movdqa  %xmm13,%xmm7

        movdqa  %xmm13,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm13,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm6,112-128(%rax)
        paddd   %xmm8,%xmm6

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   96(%rbp),%xmm6
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm13,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm13,%xmm4
        pslld   $26-21,%xmm2
        pandn   %xmm15,%xmm0
        pand    %xmm14,%xmm4
        pxor    %xmm1,%xmm7


        movdqa  %xmm9,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm9,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm6
        pxor    %xmm4,%xmm0
        movdqa  %xmm10,%xmm4
        movdqa  %xmm9,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm9,%xmm4


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm6
        pslld   $19-10,%xmm2
        pand    %xmm4,%xmm3
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm10,%xmm8
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm3,%xmm8
        paddd   %xmm6,%xmm12
        pxor    %xmm2,%xmm7

        paddd   %xmm6,%xmm8
        paddd   %xmm7,%xmm8
        leaq    256(%rbp),%rbp
        movdqa  144-128(%rax),%xmm6
        paddd   16-128(%rax),%xmm5

        movdqa  %xmm6,%xmm7
        movdqa  %xmm6,%xmm1
        psrld   $3,%xmm7
        movdqa  %xmm6,%xmm2

        psrld   $7,%xmm1
        movdqa  96-128(%rax),%xmm0
        pslld   $14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $18-7,%xmm1
        movdqa  %xmm0,%xmm3
        pxor    %xmm2,%xmm7
        pslld   $25-14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $10,%xmm0
        movdqa  %xmm3,%xmm1

        psrld   $17,%xmm3
        pxor    %xmm2,%xmm7
        pslld   $13,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        psrld   $19-17,%xmm3
        pxor    %xmm1,%xmm0
        pslld   $15-13,%xmm1
        pxor    %xmm3,%xmm0
        pxor    %xmm1,%xmm0
        paddd   %xmm0,%xmm5
        movdqa  %xmm12,%xmm7

        movdqa  %xmm12,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm12,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,128-128(%rax)
        paddd   %xmm15,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   -128(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm12,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm12,%xmm3
        pslld   $26-21,%xmm2
        pandn   %xmm14,%xmm0
        pand    %xmm13,%xmm3
        pxor    %xmm1,%xmm7


        movdqa  %xmm8,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm8,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        movdqa  %xmm9,%xmm3
        movdqa  %xmm8,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm8,%xmm3


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm3,%xmm4
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm9,%xmm15
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm4,%xmm15
        paddd   %xmm5,%xmm11
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm15
        paddd   %xmm7,%xmm15
        movdqa  160-128(%rax),%xmm5
        paddd   32-128(%rax),%xmm6

        movdqa  %xmm5,%xmm7
        movdqa  %xmm5,%xmm1
        psrld   $3,%xmm7
        movdqa  %xmm5,%xmm2

        psrld   $7,%xmm1
        movdqa  112-128(%rax),%xmm0
        pslld   $14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $18-7,%xmm1
        movdqa  %xmm0,%xmm4
        pxor    %xmm2,%xmm7
        pslld   $25-14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $10,%xmm0
        movdqa  %xmm4,%xmm1

        psrld   $17,%xmm4
        pxor    %xmm2,%xmm7
        pslld   $13,%xmm1
        paddd   %xmm7,%xmm6
        pxor    %xmm4,%xmm0
        psrld   $19-17,%xmm4
        pxor    %xmm1,%xmm0
        pslld   $15-13,%xmm1
        pxor    %xmm4,%xmm0
        pxor    %xmm1,%xmm0
        paddd   %xmm0,%xmm6
        movdqa  %xmm11,%xmm7

        movdqa  %xmm11,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm11,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm6,144-128(%rax)
        paddd   %xmm14,%xmm6

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   -96(%rbp),%xmm6
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm11,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm11,%xmm4
        pslld   $26-21,%xmm2
        pandn   %xmm13,%xmm0
        pand    %xmm12,%xmm4
        pxor    %xmm1,%xmm7


        movdqa  %xmm15,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm15,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm6
        pxor    %xmm4,%xmm0
        movdqa  %xmm8,%xmm4
        movdqa  %xmm15,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm15,%xmm4


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm6
        pslld   $19-10,%xmm2
        pand    %xmm4,%xmm3
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm8,%xmm14
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm3,%xmm14
        paddd   %xmm6,%xmm10
        pxor    %xmm2,%xmm7

        paddd   %xmm6,%xmm14
        paddd   %xmm7,%xmm14
        movdqa  176-128(%rax),%xmm6
        paddd   48-128(%rax),%xmm5

        movdqa  %xmm6,%xmm7
        movdqa  %xmm6,%xmm1
        psrld   $3,%xmm7
        movdqa  %xmm6,%xmm2

        psrld   $7,%xmm1
        movdqa  128-128(%rax),%xmm0
        pslld   $14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $18-7,%xmm1
        movdqa  %xmm0,%xmm3
        pxor    %xmm2,%xmm7
        pslld   $25-14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $10,%xmm0
        movdqa  %xmm3,%xmm1

        psrld   $17,%xmm3
        pxor    %xmm2,%xmm7
        pslld   $13,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        psrld   $19-17,%xmm3
        pxor    %xmm1,%xmm0
        pslld   $15-13,%xmm1
        pxor    %xmm3,%xmm0
        pxor    %xmm1,%xmm0
        paddd   %xmm0,%xmm5
        movdqa  %xmm10,%xmm7

        movdqa  %xmm10,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm10,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,160-128(%rax)
        paddd   %xmm13,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   -64(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm10,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm10,%xmm3
        pslld   $26-21,%xmm2
        pandn   %xmm12,%xmm0
        pand    %xmm11,%xmm3
        pxor    %xmm1,%xmm7


        movdqa  %xmm14,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm14,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        movdqa  %xmm15,%xmm3
        movdqa  %xmm14,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm14,%xmm3


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm3,%xmm4
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm15,%xmm13
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm4,%xmm13
        paddd   %xmm5,%xmm9
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm13
        paddd   %xmm7,%xmm13
        movdqa  192-128(%rax),%xmm5
        paddd   64-128(%rax),%xmm6

        movdqa  %xmm5,%xmm7
        movdqa  %xmm5,%xmm1
        psrld   $3,%xmm7
        movdqa  %xmm5,%xmm2

        psrld   $7,%xmm1
        movdqa  144-128(%rax),%xmm0
        pslld   $14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $18-7,%xmm1
        movdqa  %xmm0,%xmm4
        pxor    %xmm2,%xmm7
        pslld   $25-14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $10,%xmm0
        movdqa  %xmm4,%xmm1

        psrld   $17,%xmm4
        pxor    %xmm2,%xmm7
        pslld   $13,%xmm1
        paddd   %xmm7,%xmm6
        pxor    %xmm4,%xmm0
        psrld   $19-17,%xmm4
        pxor    %xmm1,%xmm0
        pslld   $15-13,%xmm1
        pxor    %xmm4,%xmm0
        pxor    %xmm1,%xmm0
        paddd   %xmm0,%xmm6
        movdqa  %xmm9,%xmm7

        movdqa  %xmm9,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm9,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm6,176-128(%rax)
        paddd   %xmm12,%xmm6

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   -32(%rbp),%xmm6
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm9,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm9,%xmm4
        pslld   $26-21,%xmm2
        pandn   %xmm11,%xmm0
        pand    %xmm10,%xmm4
        pxor    %xmm1,%xmm7


        movdqa  %xmm13,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm13,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm6
        pxor    %xmm4,%xmm0
        movdqa  %xmm14,%xmm4
        movdqa  %xmm13,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm13,%xmm4


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm6
        pslld   $19-10,%xmm2
        pand    %xmm4,%xmm3
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm14,%xmm12
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm3,%xmm12
        paddd   %xmm6,%xmm8
        pxor    %xmm2,%xmm7

        paddd   %xmm6,%xmm12
        paddd   %xmm7,%xmm12
        movdqa  208-128(%rax),%xmm6
        paddd   80-128(%rax),%xmm5

        movdqa  %xmm6,%xmm7
        movdqa  %xmm6,%xmm1
        psrld   $3,%xmm7
        movdqa  %xmm6,%xmm2

        psrld   $7,%xmm1
        movdqa  160-128(%rax),%xmm0
        pslld   $14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $18-7,%xmm1
        movdqa  %xmm0,%xmm3
        pxor    %xmm2,%xmm7
        pslld   $25-14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $10,%xmm0
        movdqa  %xmm3,%xmm1

        psrld   $17,%xmm3
        pxor    %xmm2,%xmm7
        pslld   $13,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        psrld   $19-17,%xmm3
        pxor    %xmm1,%xmm0
        pslld   $15-13,%xmm1
        pxor    %xmm3,%xmm0
        pxor    %xmm1,%xmm0
        paddd   %xmm0,%xmm5
        movdqa  %xmm8,%xmm7

        movdqa  %xmm8,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm8,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,192-128(%rax)
        paddd   %xmm11,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   0(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm8,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm8,%xmm3
        pslld   $26-21,%xmm2
        pandn   %xmm10,%xmm0
        pand    %xmm9,%xmm3
        pxor    %xmm1,%xmm7


        movdqa  %xmm12,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm12,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        movdqa  %xmm13,%xmm3
        movdqa  %xmm12,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm12,%xmm3


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm3,%xmm4
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm13,%xmm11
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm4,%xmm11
        paddd   %xmm5,%xmm15
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm11
        paddd   %xmm7,%xmm11
        movdqa  224-128(%rax),%xmm5
        paddd   96-128(%rax),%xmm6

        movdqa  %xmm5,%xmm7
        movdqa  %xmm5,%xmm1
        psrld   $3,%xmm7
        movdqa  %xmm5,%xmm2

        psrld   $7,%xmm1
        movdqa  176-128(%rax),%xmm0
        pslld   $14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $18-7,%xmm1
        movdqa  %xmm0,%xmm4
        pxor    %xmm2,%xmm7
        pslld   $25-14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $10,%xmm0
        movdqa  %xmm4,%xmm1

        psrld   $17,%xmm4
        pxor    %xmm2,%xmm7
        pslld   $13,%xmm1
        paddd   %xmm7,%xmm6
        pxor    %xmm4,%xmm0
        psrld   $19-17,%xmm4
        pxor    %xmm1,%xmm0
        pslld   $15-13,%xmm1
        pxor    %xmm4,%xmm0
        pxor    %xmm1,%xmm0
        paddd   %xmm0,%xmm6
        movdqa  %xmm15,%xmm7

        movdqa  %xmm15,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm15,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm6,208-128(%rax)
        paddd   %xmm10,%xmm6

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   32(%rbp),%xmm6
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm15,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm15,%xmm4
        pslld   $26-21,%xmm2
        pandn   %xmm9,%xmm0
        pand    %xmm8,%xmm4
        pxor    %xmm1,%xmm7


        movdqa  %xmm11,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm11,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm6
        pxor    %xmm4,%xmm0
        movdqa  %xmm12,%xmm4
        movdqa  %xmm11,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm11,%xmm4


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm6
        pslld   $19-10,%xmm2
        pand    %xmm4,%xmm3
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm12,%xmm10
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm3,%xmm10
        paddd   %xmm6,%xmm14
        pxor    %xmm2,%xmm7

        paddd   %xmm6,%xmm10
        paddd   %xmm7,%xmm10
        movdqa  240-128(%rax),%xmm6
        paddd   112-128(%rax),%xmm5

        movdqa  %xmm6,%xmm7
        movdqa  %xmm6,%xmm1
        psrld   $3,%xmm7
        movdqa  %xmm6,%xmm2

        psrld   $7,%xmm1
        movdqa  192-128(%rax),%xmm0
        pslld   $14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $18-7,%xmm1
        movdqa  %xmm0,%xmm3
        pxor    %xmm2,%xmm7
        pslld   $25-14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $10,%xmm0
        movdqa  %xmm3,%xmm1

        psrld   $17,%xmm3
        pxor    %xmm2,%xmm7
        pslld   $13,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        psrld   $19-17,%xmm3
        pxor    %xmm1,%xmm0
        pslld   $15-13,%xmm1
        pxor    %xmm3,%xmm0
        pxor    %xmm1,%xmm0
        paddd   %xmm0,%xmm5
        movdqa  %xmm14,%xmm7

        movdqa  %xmm14,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm14,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm5,224-128(%rax)
        paddd   %xmm9,%xmm5

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   64(%rbp),%xmm5
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm14,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm14,%xmm3
        pslld   $26-21,%xmm2
        pandn   %xmm8,%xmm0
        pand    %xmm15,%xmm3
        pxor    %xmm1,%xmm7


        movdqa  %xmm10,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm10,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm5
        pxor    %xmm3,%xmm0
        movdqa  %xmm11,%xmm3
        movdqa  %xmm10,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm10,%xmm3


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm5
        pslld   $19-10,%xmm2
        pand    %xmm3,%xmm4
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm11,%xmm9
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm4,%xmm9
        paddd   %xmm5,%xmm13
        pxor    %xmm2,%xmm7

        paddd   %xmm5,%xmm9
        paddd   %xmm7,%xmm9
        movdqa  0-128(%rax),%xmm5
        paddd   128-128(%rax),%xmm6

        movdqa  %xmm5,%xmm7
        movdqa  %xmm5,%xmm1
        psrld   $3,%xmm7
        movdqa  %xmm5,%xmm2

        psrld   $7,%xmm1
        movdqa  208-128(%rax),%xmm0
        pslld   $14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $18-7,%xmm1
        movdqa  %xmm0,%xmm4
        pxor    %xmm2,%xmm7
        pslld   $25-14,%xmm2
        pxor    %xmm1,%xmm7
        psrld   $10,%xmm0
        movdqa  %xmm4,%xmm1

        psrld   $17,%xmm4
        pxor    %xmm2,%xmm7
        pslld   $13,%xmm1
        paddd   %xmm7,%xmm6
        pxor    %xmm4,%xmm0
        psrld   $19-17,%xmm4
        pxor    %xmm1,%xmm0
        pslld   $15-13,%xmm1
        pxor    %xmm4,%xmm0
        pxor    %xmm1,%xmm0
        paddd   %xmm0,%xmm6
        movdqa  %xmm13,%xmm7

        movdqa  %xmm13,%xmm2

        psrld   $6,%xmm7
        movdqa  %xmm13,%xmm1
        pslld   $7,%xmm2
        movdqa  %xmm6,240-128(%rax)
        paddd   %xmm8,%xmm6

        psrld   $11,%xmm1
        pxor    %xmm2,%xmm7
        pslld   $21-7,%xmm2
        paddd   96(%rbp),%xmm6
        pxor    %xmm1,%xmm7

        psrld   $25-11,%xmm1
        movdqa  %xmm13,%xmm0

        pxor    %xmm2,%xmm7
        movdqa  %xmm13,%xmm4
        pslld   $26-21,%xmm2
        pandn   %xmm15,%xmm0
        pand    %xmm14,%xmm4
        pxor    %xmm1,%xmm7


        movdqa  %xmm9,%xmm1
        pxor    %xmm2,%xmm7
        movdqa  %xmm9,%xmm2
        psrld   $2,%xmm1
        paddd   %xmm7,%xmm6
        pxor    %xmm4,%xmm0
        movdqa  %xmm10,%xmm4
        movdqa  %xmm9,%xmm7
        pslld   $10,%xmm2
        pxor    %xmm9,%xmm4


        psrld   $13,%xmm7
        pxor    %xmm2,%xmm1
        paddd   %xmm0,%xmm6
        pslld   $19-10,%xmm2
        pand    %xmm4,%xmm3
        pxor    %xmm7,%xmm1


        psrld   $22-13,%xmm7
        pxor    %xmm2,%xmm1
        movdqa  %xmm10,%xmm8
        pslld   $30-19,%xmm2
        pxor    %xmm1,%xmm7
        pxor    %xmm3,%xmm8
        paddd   %xmm6,%xmm12
        pxor    %xmm2,%xmm7

        paddd   %xmm6,%xmm8
        paddd   %xmm7,%xmm8
        leaq    256(%rbp),%rbp
        decl    %ecx
        jnz     .Loop_16_xx

        # Tail of one .Loop iteration, reached when the .Loop_16_xx counter
        # in %ecx has run down to zero.  Set %ecx to the constant 1 used by
        # the compares below and point %rbp back at K256+128, the value it
        # is given at .Lbody.
        movl    $1,%ecx
        leaq    K256+128(%rip),%rbp

        # Two interleaved jobs on the four 32-bit lane counters stored at
        # offsets 0, 4, 8 and 12 from %rbx:
        #   scalar: each cmpl/cmovgeq pair overwrites one of %r8..%r11 with
        #           %rbp when the signed compare finds counter <= 1.
        #   vector: %xmm6 = all-ones in every lane whose counter is > 0
        #           (pcmpgtd against the zeroed %xmm0), then %xmm7 =
        #           counters + %xmm6, which subtracts one from exactly
        #           those lanes.
        # NOTE(review): %r8..%r11 look like the per-lane input pointers
        # loaded from 0/16/32/48(%rsi) at .Loop_grande; redirecting them to
        # the K256 table presumably keeps finished lanes reading valid
        # memory - confirm against the part of .Loop that uses them.
        movdqa  (%rbx),%xmm7
        cmpl    0(%rbx),%ecx
        pxor    %xmm0,%xmm0
        cmovgeq %rbp,%r8
        cmpl    4(%rbx),%ecx
        movdqa  %xmm7,%xmm6
        cmovgeq %rbp,%r9
        cmpl    8(%rbx),%ecx
        pcmpgtd %xmm0,%xmm6
        cmovgeq %rbp,%r10
        cmpl    12(%rbx),%ecx
        paddd   %xmm6,%xmm7
        cmovgeq %rbp,%r11

        # Feed-forward.  Mask the working registers %xmm8..%xmm15 with
        # %xmm6 (zeroing every lane whose counter was not > 0), add the
        # eight 16-byte rows found at offsets 0-128, 32-128, ..., 224-128
        # from %rdi (the same rows %xmm8..%xmm15 were loaded from before
        # .Loop), and store the sums back to those rows.  A masked-out lane
        # therefore writes its old row value back unchanged.  Loads, masks,
        # adds and stores are interleaved; %xmm0, %xmm1, %xmm2 and %xmm5
        # serve as temporaries for the loaded rows.
        movdqu  0-128(%rdi),%xmm0
        pand    %xmm6,%xmm8
        movdqu  32-128(%rdi),%xmm1
        pand    %xmm6,%xmm9
        movdqu  64-128(%rdi),%xmm2
        pand    %xmm6,%xmm10
        movdqu  96-128(%rdi),%xmm5
        pand    %xmm6,%xmm11
        paddd   %xmm0,%xmm8
        movdqu  128-128(%rdi),%xmm0
        pand    %xmm6,%xmm12
        paddd   %xmm1,%xmm9
        movdqu  160-128(%rdi),%xmm1
        pand    %xmm6,%xmm13
        paddd   %xmm2,%xmm10
        movdqu  192-128(%rdi),%xmm2
        pand    %xmm6,%xmm14
        paddd   %xmm5,%xmm11
        movdqu  224-128(%rdi),%xmm5
        pand    %xmm6,%xmm15
        paddd   %xmm0,%xmm12
        paddd   %xmm1,%xmm13
        movdqu  %xmm8,0-128(%rdi)
        paddd   %xmm2,%xmm14
        movdqu  %xmm9,32-128(%rdi)
        paddd   %xmm5,%xmm15
        movdqu  %xmm10,64-128(%rdi)
        movdqu  %xmm11,96-128(%rdi)
        movdqu  %xmm12,128-128(%rdi)
        movdqu  %xmm13,160-128(%rdi)
        movdqu  %xmm14,192-128(%rdi)
        movdqu  %xmm15,224-128(%rdi)

        # Write the decremented counters back, reload the .Lpbswap constant
        # into %xmm6 (the register was reused as the lane mask above), and
        # run .Loop again until %edx reaches zero.
        movdqa  %xmm7,(%rbx)
        movdqa  .Lpbswap(%rip),%xmm6
        decl    %edx
        jnz     .Loop

        # Outer loop.  Reload the count that .Loop_grande saved at
        # 280(%rsp), advance %rdi by 16 bytes and %rsi by 64 bytes (the span
        # of the four 16-byte entries read at .Loop_grande), and repeat
        # .Loop_grande until that count reaches zero.
        movl    280(%rsp),%edx
        leaq    16(%rdi),%rdi
        leaq    64(%rsi),%rsi
        decl    %edx
        jnz     .Loop_grande

.Ldone:
        movq    272(%rsp),%rax
.cfi_def_cfa    %rax,8
        movq    -16(%rax),%rbp
.cfi_restore    %rbp
        movq    -8(%rax),%rbx
.cfi_restore    %rbx
        leaq    (%rax),%rsp
.cfi_def_cfa_register   %rsp
.Lepilogue:
        .byte   0xf3,0xc3
.cfi_endproc
.size   sha256_multi_block,.-sha256_multi_block
#-----------------------------------------------------------------------
# sha256_multi_block_shaext  (entered through the label _shaext_shortcut)
#
# Two-lane SHA-256 multi-block compression using the SHA extension
# instructions.  sha256_multi_block jumps straight to _shaext_shortcut
# when bit 61 of the qword at OPENSSL_ia32cap_P+4 is set, so the
# registers below are that function's arguments, untouched.
#
# Inputs, as this code uses them:
#   %rdi  base of the hash state: eight rows spaced 32 bytes apart, each
#         row holding one 32-bit state word per lane.  Each outer
#         iteration reads/writes 8 bytes (two lanes) of every row and
#         then advances %rdi by 8.
#   %rsi  array of 16-byte descriptors: 8-byte data pointer at +0 and a
#         signed 32-bit block count at +8.  Two descriptors are consumed
#         per outer iteration (%rsi += 32).
#   %edx  iteration count; doubled on entry.  NOTE(review): presumably
#         the caller counts groups of four lanes (the SSE path in this
#         file steps %rsi by 64) - confirm against the caller.
#
# Register roles inside the loops:
#   %r8 / %r9        data pointers of lane 0 / lane 1
#   %rbx             256(%rsp): two dword per-lane block counters
#   %rbp             K256_shaext+128 (constant table defined elsewhere in
#                    this file; presumably the SHA-256 round constants)
#   %xmm4-%xmm7      lane 0 message words
#   %xmm8-%xmm11     lane 1 message words
#   %xmm12 / %xmm13  lane 0 state: rows {0,1,4,5} / rows {2,3,6,7}
#   %xmm14 / %xmm15  lane 1 state: rows {0,1,4,5} / rows {2,3,6,7}
#   %xmm0            implicit third operand of sha256rnds2 (message+constant)
#   %xmm1 / %xmm2    message+constant sum for lane 0 / lane 1
#   %xmm3            pshufb mask loaded from K256_shaext-16, then reused
#                    as a palignr temporary and reloaded at the loop end
#
# Stack frame (after the 256-byte alignment):
#   64..127(%rsp)  state saved at the start of each block
#                  (64: xmm12, 80: xmm13, 96: xmm14, 112: xmm15)
#   256(%rsp)      lane counters (addressed through %rbx)
#   272(%rsp)      entry value of %rsp (stored, not read back here)
#   280(%rsp)      outer-loop counter
#
# Clobbers: %rax %rcx %rdx %rsi %rdi %r8 %r9 %xmm0-%xmm15, flags.
# %rbx and %rbp are saved on entry and restored before returning.
#
# The SHA and SSSE3 instructions are emitted as raw bytes:
#   102,[68,]15,56,0,modrm      pshufb
#   102,[65,]15,58,15,modrm,4   palignr $4
#   [69,]15,56,203,modrm        sha256rnds2 (two rounds, uses %xmm0)
#   [69,]15,56,204,modrm        sha256msg1
#   [69,]15,56,205,modrm        sha256msg2
# The optional leading 65/68/69 is a REX prefix selecting %xmm8-%xmm15.
# The four sha256rnds2 encodings used are:
#   69,15,56,203,236  sha256rnds2 %xmm12,%xmm13   (lane 0)
#   69,15,56,203,229  sha256rnds2 %xmm13,%xmm12   (lane 0)
#   69,15,56,203,254  sha256rnds2 %xmm14,%xmm15   (lane 1)
#   69,15,56,203,247  sha256rnds2 %xmm15,%xmm14   (lane 1)
#-----------------------------------------------------------------------
.type   sha256_multi_block_shaext,@function
.align  32
sha256_multi_block_shaext:
.cfi_startproc
_shaext_shortcut:
        # Keep the entry %rsp in %rax for the whole function; nothing
        # below writes %rax, so the epilogue can use it directly.
        movq    %rsp,%rax
.cfi_def_cfa_register   %rax
        pushq   %rbx
.cfi_offset     %rbx,-16
        pushq   %rbp
.cfi_offset     %rbp,-24
        subq    $288,%rsp
        # Double the iteration count: two lanes per pass here.
        shll    $1,%edx
        andq    $-256,%rsp
        # Bias %rdi by 128 so that all eight rows are reachable with
        # the N-128(%rdi) displacements used below.
        leaq    128(%rdi),%rdi
        movq    %rax,272(%rsp)
.Lbody_shaext:
        leaq    256(%rsp),%rbx
        leaq    K256_shaext+128(%rip),%rbp

        # Outer loop: one pass per pair of lanes.
.Loop_grande_shaext:
        movl    %edx,280(%rsp)
        # %edx becomes max(0, count of lane 0, count of lane 1).
        xorl    %edx,%edx

        movq    0(%rsi),%r8

        movl    8(%rsi),%ecx
        cmpl    %edx,%ecx
        cmovgl  %ecx,%edx
        testl   %ecx,%ecx
        movl    %ecx,0(%rbx)
        # A lane with count <= 0 reads from the stack frame instead of
        # its (possibly invalid) data pointer.
        cmovleq %rsp,%r8

        movq    16(%rsi),%r9

        movl    24(%rsi),%ecx
        cmpl    %edx,%ecx
        cmovgl  %ecx,%edx
        testl   %ecx,%ecx
        movl    %ecx,4(%rbx)
        cmovleq %rsp,%r9
        # Nothing to hash in either lane: leave the function.
        testl   %edx,%edx
        jz      .Ldone_shaext

        # Load two lanes (8 bytes) from each of the eight state rows.
        movq    0-128(%rdi),%xmm12
        movq    32-128(%rdi),%xmm4
        movq    64-128(%rdi),%xmm13
        movq    96-128(%rdi),%xmm5
        movq    128-128(%rdi),%xmm8
        movq    160-128(%rdi),%xmm9
        movq    192-128(%rdi),%xmm10
        movq    224-128(%rdi),%xmm11

        # Transpose from row-major (word, lane) into per-lane registers.
        punpckldq       %xmm4,%xmm12
        punpckldq       %xmm5,%xmm13
        punpckldq       %xmm9,%xmm8
        punpckldq       %xmm11,%xmm10
        # pshufb mask for the message loads (presumably a big-endian
        # byte swap; the table is defined outside this block).
        movdqa  K256_shaext-16(%rip),%xmm3

        movdqa  %xmm12,%xmm14
        movdqa  %xmm13,%xmm15
        punpcklqdq      %xmm8,%xmm12
        punpcklqdq      %xmm10,%xmm13
        punpckhqdq      %xmm8,%xmm14
        punpckhqdq      %xmm10,%xmm15

        # Reverse the dword order of each state register (imm 27 =
        # 0b00011011) into the order sha256rnds2 operates on.
        pshufd  $27,%xmm12,%xmm12
        pshufd  $27,%xmm13,%xmm13
        pshufd  $27,%xmm14,%xmm14
        pshufd  $27,%xmm15,%xmm15
        jmp     .Loop_shaext

        # Inner loop: one 64-byte block per lane per iteration, the two
        # lanes interleaved instruction by instruction.
.align  32
.Loop_shaext:
        movdqu  0(%r8),%xmm4
        movdqu  0(%r9),%xmm8
        movdqu  16(%r8),%xmm5
        movdqu  16(%r9),%xmm9
        movdqu  32(%r8),%xmm6
.byte   102,15,56,0,227         # pshufb %xmm3,%xmm4
        movdqu  32(%r9),%xmm10
.byte   102,68,15,56,0,195      # pshufb %xmm3,%xmm8
        movdqu  48(%r8),%xmm7
        leaq    64(%r8),%r8
        movdqu  48(%r9),%xmm11
        leaq    64(%r9),%r9

        # Rounds 0-3: message words 0-3 plus the first constant block.
        movdqa  0-128(%rbp),%xmm0
.byte   102,15,56,0,235         # pshufb %xmm3,%xmm5
        paddd   %xmm4,%xmm0
        # This XOR is undone by the identical one a few lines below, so
        # the value of %xmm4 is unchanged overall (same for %xmm8 with
        # %xmm14).  NOTE(review): purpose is not evident from this code;
        # likely scheduling-related - confirm against the generator.
        pxor    %xmm12,%xmm4
        movdqa  %xmm0,%xmm1
        movdqa  0-128(%rbp),%xmm2
.byte   102,68,15,56,0,203      # pshufb %xmm3,%xmm9
        paddd   %xmm8,%xmm2
        # Save the incoming state so it can be added back after the
        # last round.
        movdqa  %xmm13,80(%rsp)
.byte   69,15,56,203,236        # sha256rnds2 %xmm12,%xmm13
        pxor    %xmm14,%xmm8
        movdqa  %xmm2,%xmm0
        movdqa  %xmm15,112(%rsp)
.byte   69,15,56,203,254        # sha256rnds2 %xmm14,%xmm15
        # Move the upper two message+constant dwords into the low half
        # of %xmm0 for the second pair of rounds.
        pshufd  $0x0e,%xmm1,%xmm0
        pxor    %xmm12,%xmm4
        movdqa  %xmm12,64(%rsp)
.byte   69,15,56,203,229        # sha256rnds2 %xmm13,%xmm12
        pshufd  $0x0e,%xmm2,%xmm0
        pxor    %xmm14,%xmm8
        movdqa  %xmm14,96(%rsp)
        # Rounds 4-7.
        movdqa  16-128(%rbp),%xmm1
        paddd   %xmm5,%xmm1
.byte   102,15,56,0,243         # pshufb %xmm3,%xmm6
.byte   69,15,56,203,247        # sha256rnds2 %xmm15,%xmm14

        movdqa  %xmm1,%xmm0
        movdqa  16-128(%rbp),%xmm2
        paddd   %xmm9,%xmm2
.byte   69,15,56,203,236
        movdqa  %xmm2,%xmm0
        prefetcht0      127(%r8)
.byte   102,15,56,0,251         # pshufb %xmm3,%xmm7
.byte   102,68,15,56,0,211      # pshufb %xmm3,%xmm10
        prefetcht0      127(%r9)
.byte   69,15,56,203,254
        pshufd  $0x0e,%xmm1,%xmm0
.byte   102,68,15,56,0,219      # pshufb %xmm3,%xmm11
.byte   15,56,204,229           # sha256msg1 %xmm5,%xmm4
.byte   69,15,56,203,229
        pshufd  $0x0e,%xmm2,%xmm0
        # Rounds 8-11.  From here on %xmm3 is a scratch register for the
        # palignr step of the message schedule.
        movdqa  32-128(%rbp),%xmm1
        paddd   %xmm6,%xmm1
.byte   69,15,56,203,247

        movdqa  %xmm1,%xmm0
        movdqa  32-128(%rbp),%xmm2
        paddd   %xmm10,%xmm2
.byte   69,15,56,203,236
.byte   69,15,56,204,193        # sha256msg1 %xmm9,%xmm8
        movdqa  %xmm2,%xmm0
        movdqa  %xmm7,%xmm3
.byte   69,15,56,203,254
        pshufd  $0x0e,%xmm1,%xmm0
.byte   102,15,58,15,222,4      # palignr $4,%xmm6,%xmm3
        paddd   %xmm3,%xmm4
        movdqa  %xmm11,%xmm3
.byte   102,65,15,58,15,218,4   # palignr $4,%xmm10,%xmm3
.byte   15,56,204,238
.byte   69,15,56,203,229
        pshufd  $0x0e,%xmm2,%xmm0
        # Rounds 12-15.
        movdqa  48-128(%rbp),%xmm1
        paddd   %xmm7,%xmm1
.byte   69,15,56,203,247
.byte   69,15,56,204,202

        movdqa  %xmm1,%xmm0
        movdqa  48-128(%rbp),%xmm2
        paddd   %xmm3,%xmm8
        paddd   %xmm11,%xmm2
.byte   15,56,205,231           # sha256msg2 %xmm7,%xmm4
.byte   69,15,56,203,236
        movdqa  %xmm2,%xmm0
        movdqa  %xmm4,%xmm3
.byte   102,15,58,15,223,4
.byte   69,15,56,203,254
.byte   69,15,56,205,195
        pshufd  $0x0e,%xmm1,%xmm0
        paddd   %xmm3,%xmm5
        movdqa  %xmm8,%xmm3
.byte   102,65,15,58,15,219,4
.byte   15,56,204,247
.byte   69,15,56,203,229
        pshufd  $0x0e,%xmm2,%xmm0
        # Rounds 16-19.  The groups up to constant offset 192 repeat the
        # same pattern with the message registers rotating by one
        # position per group.
        movdqa  64-128(%rbp),%xmm1
        paddd   %xmm4,%xmm1
.byte   69,15,56,203,247
.byte   69,15,56,204,211
        movdqa  %xmm1,%xmm0
        movdqa  64-128(%rbp),%xmm2
        paddd   %xmm3,%xmm9
        paddd   %xmm8,%xmm2
.byte   15,56,205,236
.byte   69,15,56,203,236
        movdqa  %xmm2,%xmm0
        movdqa  %xmm5,%xmm3
.byte   102,15,58,15,220,4
.byte   69,15,56,203,254
.byte   69,15,56,205,200
        pshufd  $0x0e,%xmm1,%xmm0
        paddd   %xmm3,%xmm6
        movdqa  %xmm9,%xmm3
.byte   102,65,15,58,15,216,4
.byte   15,56,204,252
.byte   69,15,56,203,229
        pshufd  $0x0e,%xmm2,%xmm0
        # Rounds 20-23.
        movdqa  80-128(%rbp),%xmm1
        paddd   %xmm5,%xmm1
.byte   69,15,56,203,247
.byte   69,15,56,204,216
        movdqa  %xmm1,%xmm0
        movdqa  80-128(%rbp),%xmm2
        paddd   %xmm3,%xmm10
        paddd   %xmm9,%xmm2
.byte   15,56,205,245
.byte   69,15,56,203,236
        movdqa  %xmm2,%xmm0
        movdqa  %xmm6,%xmm3
.byte   102,15,58,15,221,4
.byte   69,15,56,203,254
.byte   69,15,56,205,209
        pshufd  $0x0e,%xmm1,%xmm0
        paddd   %xmm3,%xmm7
        movdqa  %xmm10,%xmm3
.byte   102,65,15,58,15,217,4
.byte   15,56,204,229
.byte   69,15,56,203,229
        pshufd  $0x0e,%xmm2,%xmm0
        # Rounds 24-27.
        movdqa  96-128(%rbp),%xmm1
        paddd   %xmm6,%xmm1
.byte   69,15,56,203,247
.byte   69,15,56,204,193
        movdqa  %xmm1,%xmm0
        movdqa  96-128(%rbp),%xmm2
        paddd   %xmm3,%xmm11
        paddd   %xmm10,%xmm2
.byte   15,56,205,254
.byte   69,15,56,203,236
        movdqa  %xmm2,%xmm0
        movdqa  %xmm7,%xmm3
.byte   102,15,58,15,222,4
.byte   69,15,56,203,254
.byte   69,15,56,205,218
        pshufd  $0x0e,%xmm1,%xmm0
        paddd   %xmm3,%xmm4
        movdqa  %xmm11,%xmm3
.byte   102,65,15,58,15,218,4
.byte   15,56,204,238
.byte   69,15,56,203,229
        pshufd  $0x0e,%xmm2,%xmm0
        # Rounds 28-31.
        movdqa  112-128(%rbp),%xmm1
        paddd   %xmm7,%xmm1
.byte   69,15,56,203,247
.byte   69,15,56,204,202
        movdqa  %xmm1,%xmm0
        movdqa  112-128(%rbp),%xmm2
        paddd   %xmm3,%xmm8
        paddd   %xmm11,%xmm2
.byte   15,56,205,231
.byte   69,15,56,203,236
        movdqa  %xmm2,%xmm0
        movdqa  %xmm4,%xmm3
.byte   102,15,58,15,223,4
.byte   69,15,56,203,254
.byte   69,15,56,205,195
        pshufd  $0x0e,%xmm1,%xmm0
        paddd   %xmm3,%xmm5
        movdqa  %xmm8,%xmm3
.byte   102,65,15,58,15,219,4
.byte   15,56,204,247
.byte   69,15,56,203,229
        pshufd  $0x0e,%xmm2,%xmm0
        # Rounds 32-35.
        movdqa  128-128(%rbp),%xmm1
        paddd   %xmm4,%xmm1
.byte   69,15,56,203,247
.byte   69,15,56,204,211
        movdqa  %xmm1,%xmm0
        movdqa  128-128(%rbp),%xmm2
        paddd   %xmm3,%xmm9
        paddd   %xmm8,%xmm2
.byte   15,56,205,236
.byte   69,15,56,203,236
        movdqa  %xmm2,%xmm0
        movdqa  %xmm5,%xmm3
.byte   102,15,58,15,220,4
.byte   69,15,56,203,254
.byte   69,15,56,205,200
        pshufd  $0x0e,%xmm1,%xmm0
        paddd   %xmm3,%xmm6
        movdqa  %xmm9,%xmm3
.byte   102,65,15,58,15,216,4
.byte   15,56,204,252
.byte   69,15,56,203,229
        pshufd  $0x0e,%xmm2,%xmm0
        # Rounds 36-39.
        movdqa  144-128(%rbp),%xmm1
        paddd   %xmm5,%xmm1
.byte   69,15,56,203,247
.byte   69,15,56,204,216
        movdqa  %xmm1,%xmm0
        movdqa  144-128(%rbp),%xmm2
        paddd   %xmm3,%xmm10
        paddd   %xmm9,%xmm2
.byte   15,56,205,245
.byte   69,15,56,203,236
        movdqa  %xmm2,%xmm0
        movdqa  %xmm6,%xmm3
.byte   102,15,58,15,221,4
.byte   69,15,56,203,254
.byte   69,15,56,205,209
        pshufd  $0x0e,%xmm1,%xmm0
        paddd   %xmm3,%xmm7
        movdqa  %xmm10,%xmm3
.byte   102,65,15,58,15,217,4
.byte   15,56,204,229
.byte   69,15,56,203,229
        pshufd  $0x0e,%xmm2,%xmm0
        # Rounds 40-43.
        movdqa  160-128(%rbp),%xmm1
        paddd   %xmm6,%xmm1
.byte   69,15,56,203,247
.byte   69,15,56,204,193
        movdqa  %xmm1,%xmm0
        movdqa  160-128(%rbp),%xmm2
        paddd   %xmm3,%xmm11
        paddd   %xmm10,%xmm2
.byte   15,56,205,254
.byte   69,15,56,203,236
        movdqa  %xmm2,%xmm0
        movdqa  %xmm7,%xmm3
.byte   102,15,58,15,222,4
.byte   69,15,56,203,254
.byte   69,15,56,205,218
        pshufd  $0x0e,%xmm1,%xmm0
        paddd   %xmm3,%xmm4
        movdqa  %xmm11,%xmm3
.byte   102,65,15,58,15,218,4
.byte   15,56,204,238
.byte   69,15,56,203,229
        pshufd  $0x0e,%xmm2,%xmm0
        # Rounds 44-47.
        movdqa  176-128(%rbp),%xmm1
        paddd   %xmm7,%xmm1
.byte   69,15,56,203,247
.byte   69,15,56,204,202
        movdqa  %xmm1,%xmm0
        movdqa  176-128(%rbp),%xmm2
        paddd   %xmm3,%xmm8
        paddd   %xmm11,%xmm2
.byte   15,56,205,231
.byte   69,15,56,203,236
        movdqa  %xmm2,%xmm0
        movdqa  %xmm4,%xmm3
.byte   102,15,58,15,223,4
.byte   69,15,56,203,254
.byte   69,15,56,205,195
        pshufd  $0x0e,%xmm1,%xmm0
        paddd   %xmm3,%xmm5
        movdqa  %xmm8,%xmm3
.byte   102,65,15,58,15,219,4
.byte   15,56,204,247
.byte   69,15,56,203,229
        pshufd  $0x0e,%xmm2,%xmm0
        # Rounds 48-51.
        movdqa  192-128(%rbp),%xmm1
        paddd   %xmm4,%xmm1
.byte   69,15,56,203,247
.byte   69,15,56,204,211
        movdqa  %xmm1,%xmm0
        movdqa  192-128(%rbp),%xmm2
        paddd   %xmm3,%xmm9
        paddd   %xmm8,%xmm2
.byte   15,56,205,236
.byte   69,15,56,203,236
        movdqa  %xmm2,%xmm0
        movdqa  %xmm5,%xmm3
.byte   102,15,58,15,220,4
.byte   69,15,56,203,254
.byte   69,15,56,205,200
        pshufd  $0x0e,%xmm1,%xmm0
        paddd   %xmm3,%xmm6
        movdqa  %xmm9,%xmm3
.byte   102,65,15,58,15,216,4
.byte   15,56,204,252
.byte   69,15,56,203,229
        pshufd  $0x0e,%xmm2,%xmm0
        # Rounds 52-55.  The message schedule winds down from here: the
        # sha256msg1 steps are replaced by nop.
        movdqa  208-128(%rbp),%xmm1
        paddd   %xmm5,%xmm1
.byte   69,15,56,203,247
.byte   69,15,56,204,216
        movdqa  %xmm1,%xmm0
        movdqa  208-128(%rbp),%xmm2
        paddd   %xmm3,%xmm10
        paddd   %xmm9,%xmm2
.byte   15,56,205,245
.byte   69,15,56,203,236
        movdqa  %xmm2,%xmm0
        movdqa  %xmm6,%xmm3
.byte   102,15,58,15,221,4
.byte   69,15,56,203,254
.byte   69,15,56,205,209
        pshufd  $0x0e,%xmm1,%xmm0
        paddd   %xmm3,%xmm7
        movdqa  %xmm10,%xmm3
.byte   102,65,15,58,15,217,4
        nop
.byte   69,15,56,203,229
        pshufd  $0x0e,%xmm2,%xmm0
        # Rounds 56-59.
        movdqa  224-128(%rbp),%xmm1
        paddd   %xmm6,%xmm1
.byte   69,15,56,203,247

        movdqa  %xmm1,%xmm0
        movdqa  224-128(%rbp),%xmm2
        paddd   %xmm3,%xmm11
        paddd   %xmm10,%xmm2
.byte   15,56,205,254
        nop
.byte   69,15,56,203,236
        movdqa  %xmm2,%xmm0
        # %ecx = 1 and %xmm6 = 0 are the comparands for the lane-counter
        # bookkeeping that is interleaved with the last rounds.
        movl    $1,%ecx
        pxor    %xmm6,%xmm6
.byte   69,15,56,203,254
.byte   69,15,56,205,218
        pshufd  $0x0e,%xmm1,%xmm0
        # Rounds 60-63.
        movdqa  240-128(%rbp),%xmm1
        paddd   %xmm7,%xmm1
        # %xmm7 has been consumed as a message register; reuse it for
        # the two dword lane counters.
        movq    (%rbx),%xmm7
        nop
.byte   69,15,56,203,229
        pshufd  $0x0e,%xmm2,%xmm0
        movdqa  240-128(%rbp),%xmm2
        paddd   %xmm11,%xmm2
.byte   69,15,56,203,247

        movdqa  %xmm1,%xmm0
        # A lane whose counter is <= 1 has no further block: point its
        # data pointer back at the stack frame.
        cmpl    0(%rbx),%ecx
        cmovgeq %rsp,%r8
        cmpl    4(%rbx),%ecx
        cmovgeq %rsp,%r9
        # Broadcast counter 0 into %xmm9 and counter 1 into %xmm10.
        pshufd  $0x00,%xmm7,%xmm9
.byte   69,15,56,203,236
        movdqa  %xmm2,%xmm0
        pshufd  $0x55,%xmm7,%xmm10
        movdqa  %xmm7,%xmm11
.byte   69,15,56,203,254
        pshufd  $0x0e,%xmm1,%xmm0
        # All-ones mask where the counter is > 0, zero otherwise.
        pcmpgtd %xmm6,%xmm9
        pcmpgtd %xmm6,%xmm10
.byte   69,15,56,203,229
        pshufd  $0x0e,%xmm2,%xmm0
        pcmpgtd %xmm6,%xmm11
        # Restore the pshufb mask for the next block.
        movdqa  K256_shaext-16(%rip),%xmm3
.byte   69,15,56,203,247

        # Zero the freshly computed state of lanes that had no work, so
        # the additions below leave their saved state unchanged.
        pand    %xmm9,%xmm13
        pand    %xmm10,%xmm15
        pand    %xmm9,%xmm12
        pand    %xmm10,%xmm14
        # Adding the mask (-1 where counter > 0) decrements only the
        # counters that are still positive.
        paddd   %xmm7,%xmm11

        # Add back the state saved at the top of the block.
        paddd   80(%rsp),%xmm13
        paddd   112(%rsp),%xmm15
        paddd   64(%rsp),%xmm12
        paddd   96(%rsp),%xmm14

        movq    %xmm11,(%rbx)
        # %edx counts the blocks of the longer lane.
        decl    %edx
        jnz     .Loop_shaext

        movl    280(%rsp),%edx

        # Undo the dword reversal and transpose back to row-major form.
        pshufd  $27,%xmm12,%xmm12
        pshufd  $27,%xmm13,%xmm13
        pshufd  $27,%xmm14,%xmm14
        pshufd  $27,%xmm15,%xmm15

        movdqa  %xmm12,%xmm5
        movdqa  %xmm13,%xmm6
        punpckldq       %xmm14,%xmm12
        punpckhdq       %xmm14,%xmm5
        punpckldq       %xmm15,%xmm13
        punpckhdq       %xmm15,%xmm6

        # Store rows 0, 4, 1, 5 (two lanes each).
        movq    %xmm12,0-128(%rdi)
        psrldq  $8,%xmm12
        movq    %xmm5,128-128(%rdi)
        psrldq  $8,%xmm5
        movq    %xmm12,32-128(%rdi)
        movq    %xmm5,160-128(%rdi)

        # Store rows 2, 6, 3, 7.
        movq    %xmm13,64-128(%rdi)
        psrldq  $8,%xmm13
        movq    %xmm6,192-128(%rdi)
        psrldq  $8,%xmm6
        movq    %xmm13,96-128(%rdi)
        movq    %xmm6,224-128(%rdi)

        # Advance to the next pair of lanes and descriptors.
        leaq    8(%rdi),%rdi
        leaq    32(%rsi),%rsi
        decl    %edx
        jnz     .Loop_grande_shaext

.Ldone_shaext:

        # %rax still holds the entry %rsp, so the callee-saved registers
        # are reloaded relative to it and the frame is dropped at once.
        movq    -16(%rax),%rbp
.cfi_restore    %rbp
        movq    -8(%rax),%rbx
.cfi_restore    %rbx
        leaq    (%rax),%rsp
.cfi_def_cfa_register   %rsp
.Lepilogue_shaext:
        # Encodes "rep ret".
        .byte   0xf3,0xc3
.cfi_endproc
.size   sha256_multi_block_shaext,.-sha256_multi_block_shaext
.type   sha256_multi_block_avx,@function
.align  32
sha256_multi_block_avx:
.cfi_startproc
_avx_shortcut:
        shrq    $32,%rcx
        cmpl    $2,%edx
        jb      .Lavx
        testl   $32,%ecx
        jnz     _avx2_shortcut
        jmp     .Lavx
.align  32
.Lavx:
        movq    %rsp,%rax
.cfi_def_cfa_register   %rax
        pushq   %rbx
.cfi_offset     %rbx,-16
        pushq   %rbp
.cfi_offset     %rbp,-24
        subq    $288,%rsp
        andq    $-256,%rsp
        movq    %rax,272(%rsp)
.cfi_escape     0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08
.Lbody_avx:
        leaq    K256+128(%rip),%rbp
        leaq    256(%rsp),%rbx
        leaq    128(%rdi),%rdi

.Loop_grande_avx:
        movl    %edx,280(%rsp)
        xorl    %edx,%edx

        movq    0(%rsi),%r8

        movl    8(%rsi),%ecx
        cmpl    %edx,%ecx
        cmovgl  %ecx,%edx
        testl   %ecx,%ecx
        movl    %ecx,0(%rbx)
        cmovleq %rbp,%r8

        movq    16(%rsi),%r9

        movl    24(%rsi),%ecx
        cmpl    %edx,%ecx
        cmovgl  %ecx,%edx
        testl   %ecx,%ecx
        movl    %ecx,4(%rbx)
        cmovleq %rbp,%r9

        movq    32(%rsi),%r10

        movl    40(%rsi),%ecx
        cmpl    %edx,%ecx
        cmovgl  %ecx,%edx
        testl   %ecx,%ecx
        movl    %ecx,8(%rbx)
        cmovleq %rbp,%r10

        movq    48(%rsi),%r11

        movl    56(%rsi),%ecx
        cmpl    %edx,%ecx
        cmovgl  %ecx,%edx
        testl   %ecx,%ecx
        movl    %ecx,12(%rbx)
        cmovleq %rbp,%r11
        testl   %edx,%edx
        jz      .Ldone_avx

        vmovdqu 0-128(%rdi),%xmm8
        leaq    128(%rsp),%rax
        vmovdqu 32-128(%rdi),%xmm9
        vmovdqu 64-128(%rdi),%xmm10
        vmovdqu 96-128(%rdi),%xmm11
        vmovdqu 128-128(%rdi),%xmm12
        vmovdqu 160-128(%rdi),%xmm13
        vmovdqu 192-128(%rdi),%xmm14
        vmovdqu 224-128(%rdi),%xmm15
        vmovdqu .Lpbswap(%rip),%xmm6
        jmp     .Loop_avx

.align  32
.Loop_avx:
        vpxor   %xmm9,%xmm10,%xmm4
        vmovd   0(%r8),%xmm5
        vmovd   0(%r9),%xmm0
        vpinsrd $1,0(%r10),%xmm5,%xmm5
        vpinsrd $1,0(%r11),%xmm0,%xmm0
        vpunpckldq      %xmm0,%xmm5,%xmm5
        vpshufb %xmm6,%xmm5,%xmm5
        vpsrld  $6,%xmm12,%xmm7
        vpslld  $26,%xmm12,%xmm2
        vmovdqu %xmm5,0-128(%rax)
        vpaddd  %xmm15,%xmm5,%xmm5

        vpsrld  $11,%xmm12,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm12,%xmm2
        vpaddd  -128(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm12,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm12,%xmm2
        vpandn  %xmm14,%xmm12,%xmm0
        vpand   %xmm13,%xmm12,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm8,%xmm15
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm8,%xmm1
        vpxor   %xmm3,%xmm0,%xmm0
        vpxor   %xmm8,%xmm9,%xmm3

        vpxor   %xmm1,%xmm15,%xmm15
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm8,%xmm1

        vpslld  $19,%xmm8,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm3,%xmm4,%xmm4

        vpxor   %xmm1,%xmm15,%xmm7

        vpsrld  $22,%xmm8,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm8,%xmm2
        vpxor   %xmm4,%xmm9,%xmm15
        vpaddd  %xmm5,%xmm11,%xmm11

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm15,%xmm15
        vpaddd  %xmm7,%xmm15,%xmm15
        vmovd   4(%r8),%xmm5
        vmovd   4(%r9),%xmm0
        vpinsrd $1,4(%r10),%xmm5,%xmm5
        vpinsrd $1,4(%r11),%xmm0,%xmm0
        vpunpckldq      %xmm0,%xmm5,%xmm5
        vpshufb %xmm6,%xmm5,%xmm5
        vpsrld  $6,%xmm11,%xmm7
        vpslld  $26,%xmm11,%xmm2
        vmovdqu %xmm5,16-128(%rax)
        vpaddd  %xmm14,%xmm5,%xmm5

        vpsrld  $11,%xmm11,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm11,%xmm2
        vpaddd  -96(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm11,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm11,%xmm2
        vpandn  %xmm13,%xmm11,%xmm0
        vpand   %xmm12,%xmm11,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm15,%xmm14
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm15,%xmm1
        vpxor   %xmm4,%xmm0,%xmm0
        vpxor   %xmm15,%xmm8,%xmm4

        vpxor   %xmm1,%xmm14,%xmm14
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm15,%xmm1

        vpslld  $19,%xmm15,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm4,%xmm3,%xmm3

        vpxor   %xmm1,%xmm14,%xmm7

        vpsrld  $22,%xmm15,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm15,%xmm2
        vpxor   %xmm3,%xmm8,%xmm14
        vpaddd  %xmm5,%xmm10,%xmm10

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm14,%xmm14
        vpaddd  %xmm7,%xmm14,%xmm14
        vmovd   8(%r8),%xmm5
        vmovd   8(%r9),%xmm0
        vpinsrd $1,8(%r10),%xmm5,%xmm5
        vpinsrd $1,8(%r11),%xmm0,%xmm0
        vpunpckldq      %xmm0,%xmm5,%xmm5
        vpshufb %xmm6,%xmm5,%xmm5
        vpsrld  $6,%xmm10,%xmm7
        vpslld  $26,%xmm10,%xmm2
        vmovdqu %xmm5,32-128(%rax)
        vpaddd  %xmm13,%xmm5,%xmm5

        vpsrld  $11,%xmm10,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm10,%xmm2
        vpaddd  -64(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm10,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm10,%xmm2
        vpandn  %xmm12,%xmm10,%xmm0
        vpand   %xmm11,%xmm10,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm14,%xmm13
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm14,%xmm1
        vpxor   %xmm3,%xmm0,%xmm0
        vpxor   %xmm14,%xmm15,%xmm3

        vpxor   %xmm1,%xmm13,%xmm13
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm14,%xmm1

        vpslld  $19,%xmm14,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm3,%xmm4,%xmm4

        vpxor   %xmm1,%xmm13,%xmm7

        vpsrld  $22,%xmm14,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm14,%xmm2
        vpxor   %xmm4,%xmm15,%xmm13
        vpaddd  %xmm5,%xmm9,%xmm9

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm13,%xmm13
        vpaddd  %xmm7,%xmm13,%xmm13
        vmovd   12(%r8),%xmm5
        vmovd   12(%r9),%xmm0
        vpinsrd $1,12(%r10),%xmm5,%xmm5
        vpinsrd $1,12(%r11),%xmm0,%xmm0
        vpunpckldq      %xmm0,%xmm5,%xmm5
        vpshufb %xmm6,%xmm5,%xmm5
        vpsrld  $6,%xmm9,%xmm7
        vpslld  $26,%xmm9,%xmm2
        vmovdqu %xmm5,48-128(%rax)
        vpaddd  %xmm12,%xmm5,%xmm5

        vpsrld  $11,%xmm9,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm9,%xmm2
        vpaddd  -32(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm9,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm9,%xmm2
        vpandn  %xmm11,%xmm9,%xmm0
        vpand   %xmm10,%xmm9,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm13,%xmm12
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm13,%xmm1
        vpxor   %xmm4,%xmm0,%xmm0
        vpxor   %xmm13,%xmm14,%xmm4

        vpxor   %xmm1,%xmm12,%xmm12
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm13,%xmm1

        vpslld  $19,%xmm13,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm4,%xmm3,%xmm3

        vpxor   %xmm1,%xmm12,%xmm7

        vpsrld  $22,%xmm13,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm13,%xmm2
        vpxor   %xmm3,%xmm14,%xmm12
        vpaddd  %xmm5,%xmm8,%xmm8

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm12,%xmm12
        vpaddd  %xmm7,%xmm12,%xmm12
        vmovd   16(%r8),%xmm5
        vmovd   16(%r9),%xmm0
        vpinsrd $1,16(%r10),%xmm5,%xmm5
        vpinsrd $1,16(%r11),%xmm0,%xmm0
        vpunpckldq      %xmm0,%xmm5,%xmm5
        vpshufb %xmm6,%xmm5,%xmm5
        vpsrld  $6,%xmm8,%xmm7
        vpslld  $26,%xmm8,%xmm2
        vmovdqu %xmm5,64-128(%rax)
        vpaddd  %xmm11,%xmm5,%xmm5

        vpsrld  $11,%xmm8,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm8,%xmm2
        vpaddd  0(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm8,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm8,%xmm2
        vpandn  %xmm10,%xmm8,%xmm0
        vpand   %xmm9,%xmm8,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm12,%xmm11
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm12,%xmm1
        vpxor   %xmm3,%xmm0,%xmm0
        vpxor   %xmm12,%xmm13,%xmm3

        vpxor   %xmm1,%xmm11,%xmm11
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm12,%xmm1

        vpslld  $19,%xmm12,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm3,%xmm4,%xmm4

        vpxor   %xmm1,%xmm11,%xmm7

        vpsrld  $22,%xmm12,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm12,%xmm2
        vpxor   %xmm4,%xmm13,%xmm11
        vpaddd  %xmm5,%xmm15,%xmm15

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm11,%xmm11
        vpaddd  %xmm7,%xmm11,%xmm11
        vmovd   20(%r8),%xmm5
        vmovd   20(%r9),%xmm0
        vpinsrd $1,20(%r10),%xmm5,%xmm5
        vpinsrd $1,20(%r11),%xmm0,%xmm0
        vpunpckldq      %xmm0,%xmm5,%xmm5
        vpshufb %xmm6,%xmm5,%xmm5
        vpsrld  $6,%xmm15,%xmm7
        vpslld  $26,%xmm15,%xmm2
        vmovdqu %xmm5,80-128(%rax)
        vpaddd  %xmm10,%xmm5,%xmm5

        vpsrld  $11,%xmm15,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm15,%xmm2
        vpaddd  32(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm15,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm15,%xmm2
        vpandn  %xmm9,%xmm15,%xmm0
        vpand   %xmm8,%xmm15,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm11,%xmm10
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm11,%xmm1
        vpxor   %xmm4,%xmm0,%xmm0
        vpxor   %xmm11,%xmm12,%xmm4

        vpxor   %xmm1,%xmm10,%xmm10
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm11,%xmm1

        vpslld  $19,%xmm11,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm4,%xmm3,%xmm3

        vpxor   %xmm1,%xmm10,%xmm7

        vpsrld  $22,%xmm11,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm11,%xmm2
        vpxor   %xmm3,%xmm12,%xmm10
        vpaddd  %xmm5,%xmm14,%xmm14

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm10,%xmm10
        vpaddd  %xmm7,%xmm10,%xmm10
        vmovd   24(%r8),%xmm5
        vmovd   24(%r9),%xmm0
        vpinsrd $1,24(%r10),%xmm5,%xmm5
        vpinsrd $1,24(%r11),%xmm0,%xmm0
        vpunpckldq      %xmm0,%xmm5,%xmm5
        vpshufb %xmm6,%xmm5,%xmm5
        vpsrld  $6,%xmm14,%xmm7
        vpslld  $26,%xmm14,%xmm2
        vmovdqu %xmm5,96-128(%rax)
        vpaddd  %xmm9,%xmm5,%xmm5

        vpsrld  $11,%xmm14,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm14,%xmm2
        vpaddd  64(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm14,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm14,%xmm2
        vpandn  %xmm8,%xmm14,%xmm0
        vpand   %xmm15,%xmm14,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm10,%xmm9
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm10,%xmm1
        vpxor   %xmm3,%xmm0,%xmm0
        vpxor   %xmm10,%xmm11,%xmm3

        vpxor   %xmm1,%xmm9,%xmm9
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm10,%xmm1

        vpslld  $19,%xmm10,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm3,%xmm4,%xmm4

        vpxor   %xmm1,%xmm9,%xmm7

        vpsrld  $22,%xmm10,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm10,%xmm2
        vpxor   %xmm4,%xmm11,%xmm9
        vpaddd  %xmm5,%xmm13,%xmm13

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm9,%xmm9
        vpaddd  %xmm7,%xmm9,%xmm9
        vmovd   28(%r8),%xmm5
        vmovd   28(%r9),%xmm0
        vpinsrd $1,28(%r10),%xmm5,%xmm5
        vpinsrd $1,28(%r11),%xmm0,%xmm0
        vpunpckldq      %xmm0,%xmm5,%xmm5
        vpshufb %xmm6,%xmm5,%xmm5
        vpsrld  $6,%xmm13,%xmm7
        vpslld  $26,%xmm13,%xmm2
        vmovdqu %xmm5,112-128(%rax)
        vpaddd  %xmm8,%xmm5,%xmm5

        vpsrld  $11,%xmm13,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm13,%xmm2
        vpaddd  96(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm13,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm13,%xmm2
        vpandn  %xmm15,%xmm13,%xmm0
        vpand   %xmm14,%xmm13,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm9,%xmm8
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm9,%xmm1
        vpxor   %xmm4,%xmm0,%xmm0
        vpxor   %xmm9,%xmm10,%xmm4

        vpxor   %xmm1,%xmm8,%xmm8
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm9,%xmm1

        vpslld  $19,%xmm9,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm4,%xmm3,%xmm3

        vpxor   %xmm1,%xmm8,%xmm7

        vpsrld  $22,%xmm9,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm9,%xmm2
        vpxor   %xmm3,%xmm10,%xmm8
        vpaddd  %xmm5,%xmm12,%xmm12

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm8,%xmm8
        vpaddd  %xmm7,%xmm8,%xmm8
        addq    $256,%rbp
        vmovd   32(%r8),%xmm5
        vmovd   32(%r9),%xmm0
        vpinsrd $1,32(%r10),%xmm5,%xmm5
        vpinsrd $1,32(%r11),%xmm0,%xmm0
        vpunpckldq      %xmm0,%xmm5,%xmm5
        vpshufb %xmm6,%xmm5,%xmm5
        vpsrld  $6,%xmm12,%xmm7
        vpslld  $26,%xmm12,%xmm2
        vmovdqu %xmm5,128-128(%rax)
        vpaddd  %xmm15,%xmm5,%xmm5

        vpsrld  $11,%xmm12,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm12,%xmm2
        vpaddd  -128(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm12,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm12,%xmm2
        vpandn  %xmm14,%xmm12,%xmm0
        vpand   %xmm13,%xmm12,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm8,%xmm15
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm8,%xmm1
        vpxor   %xmm3,%xmm0,%xmm0
        vpxor   %xmm8,%xmm9,%xmm3

        vpxor   %xmm1,%xmm15,%xmm15
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm8,%xmm1

        vpslld  $19,%xmm8,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm3,%xmm4,%xmm4

        vpxor   %xmm1,%xmm15,%xmm7

        vpsrld  $22,%xmm8,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm8,%xmm2
        vpxor   %xmm4,%xmm9,%xmm15
        vpaddd  %xmm5,%xmm11,%xmm11

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm15,%xmm15
        vpaddd  %xmm7,%xmm15,%xmm15
        vmovd   36(%r8),%xmm5
        vmovd   36(%r9),%xmm0
        vpinsrd $1,36(%r10),%xmm5,%xmm5
        vpinsrd $1,36(%r11),%xmm0,%xmm0
        vpunpckldq      %xmm0,%xmm5,%xmm5
        vpshufb %xmm6,%xmm5,%xmm5
        vpsrld  $6,%xmm11,%xmm7
        vpslld  $26,%xmm11,%xmm2
        vmovdqu %xmm5,144-128(%rax)
        vpaddd  %xmm14,%xmm5,%xmm5

        vpsrld  $11,%xmm11,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm11,%xmm2
        vpaddd  -96(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm11,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm11,%xmm2
        vpandn  %xmm13,%xmm11,%xmm0
        vpand   %xmm12,%xmm11,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm15,%xmm14
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm15,%xmm1
        vpxor   %xmm4,%xmm0,%xmm0
        vpxor   %xmm15,%xmm8,%xmm4

        vpxor   %xmm1,%xmm14,%xmm14
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm15,%xmm1

        vpslld  $19,%xmm15,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm4,%xmm3,%xmm3

        vpxor   %xmm1,%xmm14,%xmm7

        vpsrld  $22,%xmm15,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm15,%xmm2
        vpxor   %xmm3,%xmm8,%xmm14
        vpaddd  %xmm5,%xmm10,%xmm10

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm14,%xmm14
        vpaddd  %xmm7,%xmm14,%xmm14
        vmovd   40(%r8),%xmm5
        vmovd   40(%r9),%xmm0
        vpinsrd $1,40(%r10),%xmm5,%xmm5
        vpinsrd $1,40(%r11),%xmm0,%xmm0
        vpunpckldq      %xmm0,%xmm5,%xmm5
        vpshufb %xmm6,%xmm5,%xmm5
        vpsrld  $6,%xmm10,%xmm7
        vpslld  $26,%xmm10,%xmm2
        vmovdqu %xmm5,160-128(%rax)
        vpaddd  %xmm13,%xmm5,%xmm5

        vpsrld  $11,%xmm10,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm10,%xmm2
        vpaddd  -64(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm10,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm10,%xmm2
        vpandn  %xmm12,%xmm10,%xmm0
        vpand   %xmm11,%xmm10,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm14,%xmm13
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm14,%xmm1
        vpxor   %xmm3,%xmm0,%xmm0
        vpxor   %xmm14,%xmm15,%xmm3

        vpxor   %xmm1,%xmm13,%xmm13
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm14,%xmm1

        vpslld  $19,%xmm14,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm3,%xmm4,%xmm4

        vpxor   %xmm1,%xmm13,%xmm7

        vpsrld  $22,%xmm14,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm14,%xmm2
        vpxor   %xmm4,%xmm15,%xmm13
        vpaddd  %xmm5,%xmm9,%xmm9

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm13,%xmm13
        vpaddd  %xmm7,%xmm13,%xmm13
        vmovd   44(%r8),%xmm5
        vmovd   44(%r9),%xmm0
        vpinsrd $1,44(%r10),%xmm5,%xmm5
        vpinsrd $1,44(%r11),%xmm0,%xmm0
        vpunpckldq      %xmm0,%xmm5,%xmm5
        vpshufb %xmm6,%xmm5,%xmm5
        vpsrld  $6,%xmm9,%xmm7
        vpslld  $26,%xmm9,%xmm2
        vmovdqu %xmm5,176-128(%rax)
        vpaddd  %xmm12,%xmm5,%xmm5

        vpsrld  $11,%xmm9,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm9,%xmm2
        vpaddd  -32(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm9,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm9,%xmm2
        vpandn  %xmm11,%xmm9,%xmm0
        vpand   %xmm10,%xmm9,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm13,%xmm12
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm13,%xmm1
        vpxor   %xmm4,%xmm0,%xmm0
        vpxor   %xmm13,%xmm14,%xmm4

        vpxor   %xmm1,%xmm12,%xmm12
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm13,%xmm1

        vpslld  $19,%xmm13,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm4,%xmm3,%xmm3

        vpxor   %xmm1,%xmm12,%xmm7

        vpsrld  $22,%xmm13,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm13,%xmm2
        vpxor   %xmm3,%xmm14,%xmm12
        vpaddd  %xmm5,%xmm8,%xmm8

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm12,%xmm12
        vpaddd  %xmm7,%xmm12,%xmm12
        vmovd   48(%r8),%xmm5
        vmovd   48(%r9),%xmm0
        vpinsrd $1,48(%r10),%xmm5,%xmm5
        vpinsrd $1,48(%r11),%xmm0,%xmm0
        vpunpckldq      %xmm0,%xmm5,%xmm5
        vpshufb %xmm6,%xmm5,%xmm5
        vpsrld  $6,%xmm8,%xmm7
        vpslld  $26,%xmm8,%xmm2
        vmovdqu %xmm5,192-128(%rax)
        vpaddd  %xmm11,%xmm5,%xmm5

        vpsrld  $11,%xmm8,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm8,%xmm2
        vpaddd  0(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm8,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm8,%xmm2
        vpandn  %xmm10,%xmm8,%xmm0
        vpand   %xmm9,%xmm8,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm12,%xmm11
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm12,%xmm1
        vpxor   %xmm3,%xmm0,%xmm0
        vpxor   %xmm12,%xmm13,%xmm3

        vpxor   %xmm1,%xmm11,%xmm11
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm12,%xmm1

        vpslld  $19,%xmm12,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm3,%xmm4,%xmm4

        vpxor   %xmm1,%xmm11,%xmm7

        vpsrld  $22,%xmm12,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm12,%xmm2
        vpxor   %xmm4,%xmm13,%xmm11
        vpaddd  %xmm5,%xmm15,%xmm15

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm11,%xmm11
        vpaddd  %xmm7,%xmm11,%xmm11
        vmovd   52(%r8),%xmm5
        vmovd   52(%r9),%xmm0
        vpinsrd $1,52(%r10),%xmm5,%xmm5
        vpinsrd $1,52(%r11),%xmm0,%xmm0
        vpunpckldq      %xmm0,%xmm5,%xmm5
        vpshufb %xmm6,%xmm5,%xmm5
        vpsrld  $6,%xmm15,%xmm7
        vpslld  $26,%xmm15,%xmm2
        vmovdqu %xmm5,208-128(%rax)
        vpaddd  %xmm10,%xmm5,%xmm5

        vpsrld  $11,%xmm15,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm15,%xmm2
        vpaddd  32(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm15,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm15,%xmm2
        vpandn  %xmm9,%xmm15,%xmm0
        vpand   %xmm8,%xmm15,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm11,%xmm10
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm11,%xmm1
        vpxor   %xmm4,%xmm0,%xmm0
        vpxor   %xmm11,%xmm12,%xmm4

        vpxor   %xmm1,%xmm10,%xmm10
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm11,%xmm1

        vpslld  $19,%xmm11,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm4,%xmm3,%xmm3

        vpxor   %xmm1,%xmm10,%xmm7

        vpsrld  $22,%xmm11,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm11,%xmm2
        vpxor   %xmm3,%xmm12,%xmm10
        vpaddd  %xmm5,%xmm14,%xmm14

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm10,%xmm10
        vpaddd  %xmm7,%xmm10,%xmm10
        vmovd   56(%r8),%xmm5
        vmovd   56(%r9),%xmm0
        vpinsrd $1,56(%r10),%xmm5,%xmm5
        vpinsrd $1,56(%r11),%xmm0,%xmm0
        vpunpckldq      %xmm0,%xmm5,%xmm5
        vpshufb %xmm6,%xmm5,%xmm5
        vpsrld  $6,%xmm14,%xmm7
        vpslld  $26,%xmm14,%xmm2
        vmovdqu %xmm5,224-128(%rax)
        vpaddd  %xmm9,%xmm5,%xmm5

        vpsrld  $11,%xmm14,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm14,%xmm2
        vpaddd  64(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm14,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm14,%xmm2
        vpandn  %xmm8,%xmm14,%xmm0
        vpand   %xmm15,%xmm14,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm10,%xmm9
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm10,%xmm1
        vpxor   %xmm3,%xmm0,%xmm0
        vpxor   %xmm10,%xmm11,%xmm3

        vpxor   %xmm1,%xmm9,%xmm9
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm10,%xmm1

        vpslld  $19,%xmm10,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm3,%xmm4,%xmm4

        vpxor   %xmm1,%xmm9,%xmm7

        vpsrld  $22,%xmm10,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm10,%xmm2
        vpxor   %xmm4,%xmm11,%xmm9
        vpaddd  %xmm5,%xmm13,%xmm13

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm9,%xmm9
        vpaddd  %xmm7,%xmm9,%xmm9
        vmovd   60(%r8),%xmm5
        leaq    64(%r8),%r8
        vmovd   60(%r9),%xmm0
        leaq    64(%r9),%r9
        vpinsrd $1,60(%r10),%xmm5,%xmm5
        leaq    64(%r10),%r10
        vpinsrd $1,60(%r11),%xmm0,%xmm0
        leaq    64(%r11),%r11
        vpunpckldq      %xmm0,%xmm5,%xmm5
        vpshufb %xmm6,%xmm5,%xmm5
        vpsrld  $6,%xmm13,%xmm7
        vpslld  $26,%xmm13,%xmm2
        vmovdqu %xmm5,240-128(%rax)
        vpaddd  %xmm8,%xmm5,%xmm5

        vpsrld  $11,%xmm13,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm13,%xmm2
        vpaddd  96(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm13,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        prefetcht0      63(%r8)
        vpslld  $7,%xmm13,%xmm2
        vpandn  %xmm15,%xmm13,%xmm0
        vpand   %xmm14,%xmm13,%xmm4
        prefetcht0      63(%r9)
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm9,%xmm8
        vpxor   %xmm2,%xmm7,%xmm7
        prefetcht0      63(%r10)
        vpslld  $30,%xmm9,%xmm1
        vpxor   %xmm4,%xmm0,%xmm0
        vpxor   %xmm9,%xmm10,%xmm4
        prefetcht0      63(%r11)
        vpxor   %xmm1,%xmm8,%xmm8
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm9,%xmm1

        vpslld  $19,%xmm9,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm4,%xmm3,%xmm3

        vpxor   %xmm1,%xmm8,%xmm7

        vpsrld  $22,%xmm9,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm9,%xmm2
        vpxor   %xmm3,%xmm10,%xmm8
        vpaddd  %xmm5,%xmm12,%xmm12

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm8,%xmm8
        vpaddd  %xmm7,%xmm8,%xmm8
        addq    $256,%rbp
        vmovdqu 0-128(%rax),%xmm5
        movl    $3,%ecx
        jmp     .Loop_16_xx_avx
.align  32
.Loop_16_xx_avx:
        vmovdqu 16-128(%rax),%xmm6
        vpaddd  144-128(%rax),%xmm5,%xmm5

        vpsrld  $3,%xmm6,%xmm7
        vpsrld  $7,%xmm6,%xmm1
        vpslld  $25,%xmm6,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $18,%xmm6,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $14,%xmm6,%xmm2
        vmovdqu 224-128(%rax),%xmm0
        vpsrld  $10,%xmm0,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $17,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $15,%xmm0,%xmm2
        vpaddd  %xmm7,%xmm5,%xmm5
        vpxor   %xmm1,%xmm3,%xmm7
        vpsrld  $19,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $13,%xmm0,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7
        vpaddd  %xmm7,%xmm5,%xmm5
        vpsrld  $6,%xmm12,%xmm7
        vpslld  $26,%xmm12,%xmm2
        vmovdqu %xmm5,0-128(%rax)
        vpaddd  %xmm15,%xmm5,%xmm5

        vpsrld  $11,%xmm12,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm12,%xmm2
        vpaddd  -128(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm12,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm12,%xmm2
        vpandn  %xmm14,%xmm12,%xmm0
        vpand   %xmm13,%xmm12,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm8,%xmm15
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm8,%xmm1
        vpxor   %xmm3,%xmm0,%xmm0
        vpxor   %xmm8,%xmm9,%xmm3

        vpxor   %xmm1,%xmm15,%xmm15
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm8,%xmm1

        vpslld  $19,%xmm8,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm3,%xmm4,%xmm4

        vpxor   %xmm1,%xmm15,%xmm7

        vpsrld  $22,%xmm8,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm8,%xmm2
        vpxor   %xmm4,%xmm9,%xmm15
        vpaddd  %xmm5,%xmm11,%xmm11

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm15,%xmm15
        vpaddd  %xmm7,%xmm15,%xmm15
        vmovdqu 32-128(%rax),%xmm5
        vpaddd  160-128(%rax),%xmm6,%xmm6

        vpsrld  $3,%xmm5,%xmm7
        vpsrld  $7,%xmm5,%xmm1
        vpslld  $25,%xmm5,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $18,%xmm5,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $14,%xmm5,%xmm2
        vmovdqu 240-128(%rax),%xmm0
        vpsrld  $10,%xmm0,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $17,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $15,%xmm0,%xmm2
        vpaddd  %xmm7,%xmm6,%xmm6
        vpxor   %xmm1,%xmm4,%xmm7
        vpsrld  $19,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $13,%xmm0,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7
        vpaddd  %xmm7,%xmm6,%xmm6
        vpsrld  $6,%xmm11,%xmm7
        vpslld  $26,%xmm11,%xmm2
        vmovdqu %xmm6,16-128(%rax)
        vpaddd  %xmm14,%xmm6,%xmm6

        vpsrld  $11,%xmm11,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm11,%xmm2
        vpaddd  -96(%rbp),%xmm6,%xmm6
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm11,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm11,%xmm2
        vpandn  %xmm13,%xmm11,%xmm0
        vpand   %xmm12,%xmm11,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm15,%xmm14
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm15,%xmm1
        vpxor   %xmm4,%xmm0,%xmm0
        vpxor   %xmm15,%xmm8,%xmm4

        vpxor   %xmm1,%xmm14,%xmm14
        vpaddd  %xmm7,%xmm6,%xmm6

        vpsrld  $13,%xmm15,%xmm1

        vpslld  $19,%xmm15,%xmm2
        vpaddd  %xmm0,%xmm6,%xmm6
        vpand   %xmm4,%xmm3,%xmm3

        vpxor   %xmm1,%xmm14,%xmm7

        vpsrld  $22,%xmm15,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm15,%xmm2
        vpxor   %xmm3,%xmm8,%xmm14
        vpaddd  %xmm6,%xmm10,%xmm10

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm6,%xmm14,%xmm14
        vpaddd  %xmm7,%xmm14,%xmm14
        vmovdqu 48-128(%rax),%xmm6
        vpaddd  176-128(%rax),%xmm5,%xmm5

        vpsrld  $3,%xmm6,%xmm7
        vpsrld  $7,%xmm6,%xmm1
        vpslld  $25,%xmm6,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $18,%xmm6,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $14,%xmm6,%xmm2
        vmovdqu 0-128(%rax),%xmm0
        vpsrld  $10,%xmm0,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $17,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $15,%xmm0,%xmm2
        vpaddd  %xmm7,%xmm5,%xmm5
        vpxor   %xmm1,%xmm3,%xmm7
        vpsrld  $19,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $13,%xmm0,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7
        vpaddd  %xmm7,%xmm5,%xmm5
        vpsrld  $6,%xmm10,%xmm7
        vpslld  $26,%xmm10,%xmm2
        vmovdqu %xmm5,32-128(%rax)
        vpaddd  %xmm13,%xmm5,%xmm5

        vpsrld  $11,%xmm10,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm10,%xmm2
        vpaddd  -64(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm10,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm10,%xmm2
        vpandn  %xmm12,%xmm10,%xmm0
        vpand   %xmm11,%xmm10,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm14,%xmm13
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm14,%xmm1
        vpxor   %xmm3,%xmm0,%xmm0
        vpxor   %xmm14,%xmm15,%xmm3

        vpxor   %xmm1,%xmm13,%xmm13
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm14,%xmm1

        vpslld  $19,%xmm14,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm3,%xmm4,%xmm4

        vpxor   %xmm1,%xmm13,%xmm7

        vpsrld  $22,%xmm14,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm14,%xmm2
        vpxor   %xmm4,%xmm15,%xmm13
        vpaddd  %xmm5,%xmm9,%xmm9

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm13,%xmm13
        vpaddd  %xmm7,%xmm13,%xmm13
        vmovdqu 64-128(%rax),%xmm5
        vpaddd  192-128(%rax),%xmm6,%xmm6

        vpsrld  $3,%xmm5,%xmm7
        vpsrld  $7,%xmm5,%xmm1
        vpslld  $25,%xmm5,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $18,%xmm5,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $14,%xmm5,%xmm2
        vmovdqu 16-128(%rax),%xmm0
        vpsrld  $10,%xmm0,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $17,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $15,%xmm0,%xmm2
        vpaddd  %xmm7,%xmm6,%xmm6
        vpxor   %xmm1,%xmm4,%xmm7
        vpsrld  $19,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $13,%xmm0,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7
        vpaddd  %xmm7,%xmm6,%xmm6
        vpsrld  $6,%xmm9,%xmm7
        vpslld  $26,%xmm9,%xmm2
        vmovdqu %xmm6,48-128(%rax)
        vpaddd  %xmm12,%xmm6,%xmm6

        vpsrld  $11,%xmm9,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm9,%xmm2
        vpaddd  -32(%rbp),%xmm6,%xmm6
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm9,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm9,%xmm2
        vpandn  %xmm11,%xmm9,%xmm0
        vpand   %xmm10,%xmm9,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm13,%xmm12
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm13,%xmm1
        vpxor   %xmm4,%xmm0,%xmm0
        vpxor   %xmm13,%xmm14,%xmm4

        vpxor   %xmm1,%xmm12,%xmm12
        vpaddd  %xmm7,%xmm6,%xmm6

        vpsrld  $13,%xmm13,%xmm1

        vpslld  $19,%xmm13,%xmm2
        vpaddd  %xmm0,%xmm6,%xmm6
        vpand   %xmm4,%xmm3,%xmm3

        vpxor   %xmm1,%xmm12,%xmm7

        vpsrld  $22,%xmm13,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm13,%xmm2
        vpxor   %xmm3,%xmm14,%xmm12
        vpaddd  %xmm6,%xmm8,%xmm8

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm6,%xmm12,%xmm12
        vpaddd  %xmm7,%xmm12,%xmm12
        vmovdqu 80-128(%rax),%xmm6
        vpaddd  208-128(%rax),%xmm5,%xmm5

        vpsrld  $3,%xmm6,%xmm7
        vpsrld  $7,%xmm6,%xmm1
        vpslld  $25,%xmm6,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $18,%xmm6,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $14,%xmm6,%xmm2
        vmovdqu 32-128(%rax),%xmm0
        vpsrld  $10,%xmm0,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $17,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $15,%xmm0,%xmm2
        vpaddd  %xmm7,%xmm5,%xmm5
        vpxor   %xmm1,%xmm3,%xmm7
        vpsrld  $19,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $13,%xmm0,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7
        vpaddd  %xmm7,%xmm5,%xmm5
        vpsrld  $6,%xmm8,%xmm7
        vpslld  $26,%xmm8,%xmm2
        vmovdqu %xmm5,64-128(%rax)
        vpaddd  %xmm11,%xmm5,%xmm5

        vpsrld  $11,%xmm8,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm8,%xmm2
        vpaddd  0(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm8,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm8,%xmm2
        vpandn  %xmm10,%xmm8,%xmm0
        vpand   %xmm9,%xmm8,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm12,%xmm11
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm12,%xmm1
        vpxor   %xmm3,%xmm0,%xmm0
        vpxor   %xmm12,%xmm13,%xmm3

        vpxor   %xmm1,%xmm11,%xmm11
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm12,%xmm1

        vpslld  $19,%xmm12,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm3,%xmm4,%xmm4

        vpxor   %xmm1,%xmm11,%xmm7

        vpsrld  $22,%xmm12,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm12,%xmm2
        vpxor   %xmm4,%xmm13,%xmm11
        vpaddd  %xmm5,%xmm15,%xmm15

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm11,%xmm11
        vpaddd  %xmm7,%xmm11,%xmm11
        vmovdqu 96-128(%rax),%xmm5
        vpaddd  224-128(%rax),%xmm6,%xmm6

        vpsrld  $3,%xmm5,%xmm7
        vpsrld  $7,%xmm5,%xmm1
        vpslld  $25,%xmm5,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $18,%xmm5,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $14,%xmm5,%xmm2
        vmovdqu 48-128(%rax),%xmm0
        vpsrld  $10,%xmm0,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $17,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $15,%xmm0,%xmm2
        vpaddd  %xmm7,%xmm6,%xmm6
        vpxor   %xmm1,%xmm4,%xmm7
        vpsrld  $19,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $13,%xmm0,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7
        vpaddd  %xmm7,%xmm6,%xmm6
        vpsrld  $6,%xmm15,%xmm7
        vpslld  $26,%xmm15,%xmm2
        vmovdqu %xmm6,80-128(%rax)
        vpaddd  %xmm10,%xmm6,%xmm6

        vpsrld  $11,%xmm15,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm15,%xmm2
        vpaddd  32(%rbp),%xmm6,%xmm6
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm15,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm15,%xmm2
        vpandn  %xmm9,%xmm15,%xmm0
        vpand   %xmm8,%xmm15,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm11,%xmm10
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm11,%xmm1
        vpxor   %xmm4,%xmm0,%xmm0
        vpxor   %xmm11,%xmm12,%xmm4

        vpxor   %xmm1,%xmm10,%xmm10
        vpaddd  %xmm7,%xmm6,%xmm6

        vpsrld  $13,%xmm11,%xmm1

        vpslld  $19,%xmm11,%xmm2
        vpaddd  %xmm0,%xmm6,%xmm6
        vpand   %xmm4,%xmm3,%xmm3

        vpxor   %xmm1,%xmm10,%xmm7

        vpsrld  $22,%xmm11,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm11,%xmm2
        vpxor   %xmm3,%xmm12,%xmm10
        vpaddd  %xmm6,%xmm14,%xmm14

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm6,%xmm10,%xmm10
        vpaddd  %xmm7,%xmm10,%xmm10
        vmovdqu 112-128(%rax),%xmm6
        vpaddd  240-128(%rax),%xmm5,%xmm5

        vpsrld  $3,%xmm6,%xmm7
        vpsrld  $7,%xmm6,%xmm1
        vpslld  $25,%xmm6,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $18,%xmm6,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $14,%xmm6,%xmm2
        vmovdqu 64-128(%rax),%xmm0
        vpsrld  $10,%xmm0,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $17,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $15,%xmm0,%xmm2
        vpaddd  %xmm7,%xmm5,%xmm5
        vpxor   %xmm1,%xmm3,%xmm7
        vpsrld  $19,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $13,%xmm0,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7
        vpaddd  %xmm7,%xmm5,%xmm5
        vpsrld  $6,%xmm14,%xmm7
        vpslld  $26,%xmm14,%xmm2
        vmovdqu %xmm5,96-128(%rax)
        vpaddd  %xmm9,%xmm5,%xmm5

        vpsrld  $11,%xmm14,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm14,%xmm2
        vpaddd  64(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm14,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm14,%xmm2
        vpandn  %xmm8,%xmm14,%xmm0
        vpand   %xmm15,%xmm14,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm10,%xmm9
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm10,%xmm1
        vpxor   %xmm3,%xmm0,%xmm0
        vpxor   %xmm10,%xmm11,%xmm3

        vpxor   %xmm1,%xmm9,%xmm9
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm10,%xmm1

        vpslld  $19,%xmm10,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm3,%xmm4,%xmm4

        vpxor   %xmm1,%xmm9,%xmm7

        vpsrld  $22,%xmm10,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm10,%xmm2
        vpxor   %xmm4,%xmm11,%xmm9
        vpaddd  %xmm5,%xmm13,%xmm13

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm9,%xmm9
        vpaddd  %xmm7,%xmm9,%xmm9
        vmovdqu 128-128(%rax),%xmm5
        vpaddd  0-128(%rax),%xmm6,%xmm6

        vpsrld  $3,%xmm5,%xmm7
        vpsrld  $7,%xmm5,%xmm1
        vpslld  $25,%xmm5,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $18,%xmm5,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $14,%xmm5,%xmm2
        vmovdqu 80-128(%rax),%xmm0
        vpsrld  $10,%xmm0,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $17,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $15,%xmm0,%xmm2
        vpaddd  %xmm7,%xmm6,%xmm6
        vpxor   %xmm1,%xmm4,%xmm7
        vpsrld  $19,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $13,%xmm0,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7
        vpaddd  %xmm7,%xmm6,%xmm6
        vpsrld  $6,%xmm13,%xmm7
        vpslld  $26,%xmm13,%xmm2
        vmovdqu %xmm6,112-128(%rax)
        vpaddd  %xmm8,%xmm6,%xmm6

        vpsrld  $11,%xmm13,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm13,%xmm2
        vpaddd  96(%rbp),%xmm6,%xmm6
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm13,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm13,%xmm2
        vpandn  %xmm15,%xmm13,%xmm0
        vpand   %xmm14,%xmm13,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm9,%xmm8
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm9,%xmm1
        vpxor   %xmm4,%xmm0,%xmm0
        vpxor   %xmm9,%xmm10,%xmm4

        vpxor   %xmm1,%xmm8,%xmm8
        vpaddd  %xmm7,%xmm6,%xmm6

        vpsrld  $13,%xmm9,%xmm1

        vpslld  $19,%xmm9,%xmm2
        vpaddd  %xmm0,%xmm6,%xmm6
        vpand   %xmm4,%xmm3,%xmm3

        vpxor   %xmm1,%xmm8,%xmm7

        vpsrld  $22,%xmm9,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm9,%xmm2
        vpxor   %xmm3,%xmm10,%xmm8
        vpaddd  %xmm6,%xmm12,%xmm12

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm6,%xmm8,%xmm8
        vpaddd  %xmm7,%xmm8,%xmm8
        addq    $256,%rbp
        vmovdqu 144-128(%rax),%xmm6
        vpaddd  16-128(%rax),%xmm5,%xmm5

        vpsrld  $3,%xmm6,%xmm7
        vpsrld  $7,%xmm6,%xmm1
        vpslld  $25,%xmm6,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $18,%xmm6,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $14,%xmm6,%xmm2
        vmovdqu 96-128(%rax),%xmm0
        vpsrld  $10,%xmm0,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $17,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $15,%xmm0,%xmm2
        vpaddd  %xmm7,%xmm5,%xmm5
        vpxor   %xmm1,%xmm3,%xmm7
        vpsrld  $19,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $13,%xmm0,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7
        vpaddd  %xmm7,%xmm5,%xmm5
        vpsrld  $6,%xmm12,%xmm7
        vpslld  $26,%xmm12,%xmm2
        vmovdqu %xmm5,128-128(%rax)
        vpaddd  %xmm15,%xmm5,%xmm5

        vpsrld  $11,%xmm12,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm12,%xmm2
        vpaddd  -128(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm12,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm12,%xmm2
        vpandn  %xmm14,%xmm12,%xmm0
        vpand   %xmm13,%xmm12,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm8,%xmm15
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm8,%xmm1
        vpxor   %xmm3,%xmm0,%xmm0
        vpxor   %xmm8,%xmm9,%xmm3

        vpxor   %xmm1,%xmm15,%xmm15
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm8,%xmm1

        vpslld  $19,%xmm8,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm3,%xmm4,%xmm4

        vpxor   %xmm1,%xmm15,%xmm7

        vpsrld  $22,%xmm8,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm8,%xmm2
        vpxor   %xmm4,%xmm9,%xmm15
        vpaddd  %xmm5,%xmm11,%xmm11

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm15,%xmm15
        vpaddd  %xmm7,%xmm15,%xmm15
        vmovdqu 160-128(%rax),%xmm5
        vpaddd  32-128(%rax),%xmm6,%xmm6

        vpsrld  $3,%xmm5,%xmm7
        vpsrld  $7,%xmm5,%xmm1
        vpslld  $25,%xmm5,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $18,%xmm5,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $14,%xmm5,%xmm2
        vmovdqu 112-128(%rax),%xmm0
        vpsrld  $10,%xmm0,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $17,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $15,%xmm0,%xmm2
        vpaddd  %xmm7,%xmm6,%xmm6
        vpxor   %xmm1,%xmm4,%xmm7
        vpsrld  $19,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $13,%xmm0,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7
        vpaddd  %xmm7,%xmm6,%xmm6
        vpsrld  $6,%xmm11,%xmm7
        vpslld  $26,%xmm11,%xmm2
        vmovdqu %xmm6,144-128(%rax)
        vpaddd  %xmm14,%xmm6,%xmm6

        vpsrld  $11,%xmm11,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm11,%xmm2
        vpaddd  -96(%rbp),%xmm6,%xmm6
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm11,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm11,%xmm2
        vpandn  %xmm13,%xmm11,%xmm0
        vpand   %xmm12,%xmm11,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm15,%xmm14
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm15,%xmm1
        vpxor   %xmm4,%xmm0,%xmm0
        vpxor   %xmm15,%xmm8,%xmm4

        vpxor   %xmm1,%xmm14,%xmm14
        vpaddd  %xmm7,%xmm6,%xmm6

        vpsrld  $13,%xmm15,%xmm1

        vpslld  $19,%xmm15,%xmm2
        vpaddd  %xmm0,%xmm6,%xmm6
        vpand   %xmm4,%xmm3,%xmm3

        vpxor   %xmm1,%xmm14,%xmm7

        vpsrld  $22,%xmm15,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm15,%xmm2
        vpxor   %xmm3,%xmm8,%xmm14
        vpaddd  %xmm6,%xmm10,%xmm10

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm6,%xmm14,%xmm14
        vpaddd  %xmm7,%xmm14,%xmm14
        vmovdqu 176-128(%rax),%xmm6
        vpaddd  48-128(%rax),%xmm5,%xmm5

        vpsrld  $3,%xmm6,%xmm7
        vpsrld  $7,%xmm6,%xmm1
        vpslld  $25,%xmm6,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $18,%xmm6,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $14,%xmm6,%xmm2
        vmovdqu 128-128(%rax),%xmm0
        vpsrld  $10,%xmm0,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $17,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $15,%xmm0,%xmm2
        vpaddd  %xmm7,%xmm5,%xmm5
        vpxor   %xmm1,%xmm3,%xmm7
        vpsrld  $19,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $13,%xmm0,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7
        vpaddd  %xmm7,%xmm5,%xmm5
        vpsrld  $6,%xmm10,%xmm7
        vpslld  $26,%xmm10,%xmm2
        vmovdqu %xmm5,160-128(%rax)
        vpaddd  %xmm13,%xmm5,%xmm5

        vpsrld  $11,%xmm10,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm10,%xmm2
        vpaddd  -64(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm10,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm10,%xmm2
        vpandn  %xmm12,%xmm10,%xmm0
        vpand   %xmm11,%xmm10,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm14,%xmm13
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm14,%xmm1
        vpxor   %xmm3,%xmm0,%xmm0
        vpxor   %xmm14,%xmm15,%xmm3

        vpxor   %xmm1,%xmm13,%xmm13
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm14,%xmm1

        vpslld  $19,%xmm14,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm3,%xmm4,%xmm4

        vpxor   %xmm1,%xmm13,%xmm7

        vpsrld  $22,%xmm14,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm14,%xmm2
        vpxor   %xmm4,%xmm15,%xmm13
        vpaddd  %xmm5,%xmm9,%xmm9

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm13,%xmm13
        vpaddd  %xmm7,%xmm13,%xmm13
        vmovdqu 192-128(%rax),%xmm5
        vpaddd  64-128(%rax),%xmm6,%xmm6

        vpsrld  $3,%xmm5,%xmm7
        vpsrld  $7,%xmm5,%xmm1
        vpslld  $25,%xmm5,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $18,%xmm5,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $14,%xmm5,%xmm2
        vmovdqu 144-128(%rax),%xmm0
        vpsrld  $10,%xmm0,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $17,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $15,%xmm0,%xmm2
        vpaddd  %xmm7,%xmm6,%xmm6
        vpxor   %xmm1,%xmm4,%xmm7
        vpsrld  $19,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $13,%xmm0,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7
        vpaddd  %xmm7,%xmm6,%xmm6
        vpsrld  $6,%xmm9,%xmm7
        vpslld  $26,%xmm9,%xmm2
        vmovdqu %xmm6,176-128(%rax)
        vpaddd  %xmm12,%xmm6,%xmm6

        vpsrld  $11,%xmm9,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm9,%xmm2
        vpaddd  -32(%rbp),%xmm6,%xmm6
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm9,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm9,%xmm2
        vpandn  %xmm11,%xmm9,%xmm0
        vpand   %xmm10,%xmm9,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm13,%xmm12
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm13,%xmm1
        vpxor   %xmm4,%xmm0,%xmm0
        vpxor   %xmm13,%xmm14,%xmm4

        vpxor   %xmm1,%xmm12,%xmm12
        vpaddd  %xmm7,%xmm6,%xmm6

        vpsrld  $13,%xmm13,%xmm1

        vpslld  $19,%xmm13,%xmm2
        vpaddd  %xmm0,%xmm6,%xmm6
        vpand   %xmm4,%xmm3,%xmm3

        vpxor   %xmm1,%xmm12,%xmm7

        vpsrld  $22,%xmm13,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm13,%xmm2
        vpxor   %xmm3,%xmm14,%xmm12
        vpaddd  %xmm6,%xmm8,%xmm8

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm6,%xmm12,%xmm12
        vpaddd  %xmm7,%xmm12,%xmm12
        vmovdqu 208-128(%rax),%xmm6
        vpaddd  80-128(%rax),%xmm5,%xmm5

        vpsrld  $3,%xmm6,%xmm7
        vpsrld  $7,%xmm6,%xmm1
        vpslld  $25,%xmm6,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $18,%xmm6,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $14,%xmm6,%xmm2
        vmovdqu 160-128(%rax),%xmm0
        vpsrld  $10,%xmm0,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $17,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $15,%xmm0,%xmm2
        vpaddd  %xmm7,%xmm5,%xmm5
        vpxor   %xmm1,%xmm3,%xmm7
        vpsrld  $19,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $13,%xmm0,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7
        vpaddd  %xmm7,%xmm5,%xmm5
        vpsrld  $6,%xmm8,%xmm7
        vpslld  $26,%xmm8,%xmm2
        vmovdqu %xmm5,192-128(%rax)
        vpaddd  %xmm11,%xmm5,%xmm5

        vpsrld  $11,%xmm8,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm8,%xmm2
        vpaddd  0(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm8,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm8,%xmm2
        vpandn  %xmm10,%xmm8,%xmm0
        vpand   %xmm9,%xmm8,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm12,%xmm11
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm12,%xmm1
        vpxor   %xmm3,%xmm0,%xmm0
        vpxor   %xmm12,%xmm13,%xmm3

        vpxor   %xmm1,%xmm11,%xmm11
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm12,%xmm1

        vpslld  $19,%xmm12,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm3,%xmm4,%xmm4

        vpxor   %xmm1,%xmm11,%xmm7

        vpsrld  $22,%xmm12,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm12,%xmm2
        vpxor   %xmm4,%xmm13,%xmm11
        vpaddd  %xmm5,%xmm15,%xmm15

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm11,%xmm11
        vpaddd  %xmm7,%xmm11,%xmm11
        vmovdqu 224-128(%rax),%xmm5
        vpaddd  96-128(%rax),%xmm6,%xmm6

        vpsrld  $3,%xmm5,%xmm7
        vpsrld  $7,%xmm5,%xmm1
        vpslld  $25,%xmm5,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $18,%xmm5,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $14,%xmm5,%xmm2
        vmovdqu 176-128(%rax),%xmm0
        vpsrld  $10,%xmm0,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $17,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $15,%xmm0,%xmm2
        vpaddd  %xmm7,%xmm6,%xmm6
        vpxor   %xmm1,%xmm4,%xmm7
        vpsrld  $19,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $13,%xmm0,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7
        vpaddd  %xmm7,%xmm6,%xmm6
        vpsrld  $6,%xmm15,%xmm7
        vpslld  $26,%xmm15,%xmm2
        vmovdqu %xmm6,208-128(%rax)
        vpaddd  %xmm10,%xmm6,%xmm6

        vpsrld  $11,%xmm15,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm15,%xmm2
        vpaddd  32(%rbp),%xmm6,%xmm6
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm15,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm15,%xmm2
        vpandn  %xmm9,%xmm15,%xmm0
        vpand   %xmm8,%xmm15,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm11,%xmm10
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm11,%xmm1
        vpxor   %xmm4,%xmm0,%xmm0
        vpxor   %xmm11,%xmm12,%xmm4

        vpxor   %xmm1,%xmm10,%xmm10
        vpaddd  %xmm7,%xmm6,%xmm6

        vpsrld  $13,%xmm11,%xmm1

        vpslld  $19,%xmm11,%xmm2
        vpaddd  %xmm0,%xmm6,%xmm6
        vpand   %xmm4,%xmm3,%xmm3

        vpxor   %xmm1,%xmm10,%xmm7

        vpsrld  $22,%xmm11,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm11,%xmm2
        vpxor   %xmm3,%xmm12,%xmm10
        vpaddd  %xmm6,%xmm14,%xmm14

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm6,%xmm10,%xmm10
        vpaddd  %xmm7,%xmm10,%xmm10
        vmovdqu 240-128(%rax),%xmm6
        vpaddd  112-128(%rax),%xmm5,%xmm5

        vpsrld  $3,%xmm6,%xmm7
        vpsrld  $7,%xmm6,%xmm1
        vpslld  $25,%xmm6,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $18,%xmm6,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $14,%xmm6,%xmm2
        vmovdqu 192-128(%rax),%xmm0
        vpsrld  $10,%xmm0,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $17,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $15,%xmm0,%xmm2
        vpaddd  %xmm7,%xmm5,%xmm5
        vpxor   %xmm1,%xmm3,%xmm7
        vpsrld  $19,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $13,%xmm0,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7
        vpaddd  %xmm7,%xmm5,%xmm5
        vpsrld  $6,%xmm14,%xmm7
        vpslld  $26,%xmm14,%xmm2
        vmovdqu %xmm5,224-128(%rax)
        vpaddd  %xmm9,%xmm5,%xmm5

        vpsrld  $11,%xmm14,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm14,%xmm2
        vpaddd  64(%rbp),%xmm5,%xmm5
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm14,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm14,%xmm2
        vpandn  %xmm8,%xmm14,%xmm0
        vpand   %xmm15,%xmm14,%xmm3

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm10,%xmm9
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm10,%xmm1
        vpxor   %xmm3,%xmm0,%xmm0
        vpxor   %xmm10,%xmm11,%xmm3

        vpxor   %xmm1,%xmm9,%xmm9
        vpaddd  %xmm7,%xmm5,%xmm5

        vpsrld  $13,%xmm10,%xmm1

        vpslld  $19,%xmm10,%xmm2
        vpaddd  %xmm0,%xmm5,%xmm5
        vpand   %xmm3,%xmm4,%xmm4

        vpxor   %xmm1,%xmm9,%xmm7

        vpsrld  $22,%xmm10,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm10,%xmm2
        vpxor   %xmm4,%xmm11,%xmm9
        vpaddd  %xmm5,%xmm13,%xmm13

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm5,%xmm9,%xmm9
        vpaddd  %xmm7,%xmm9,%xmm9
        vmovdqu 0-128(%rax),%xmm5
        vpaddd  128-128(%rax),%xmm6,%xmm6

        vpsrld  $3,%xmm5,%xmm7
        vpsrld  $7,%xmm5,%xmm1
        vpslld  $25,%xmm5,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $18,%xmm5,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $14,%xmm5,%xmm2
        vmovdqu 208-128(%rax),%xmm0
        vpsrld  $10,%xmm0,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7
        vpsrld  $17,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $15,%xmm0,%xmm2
        vpaddd  %xmm7,%xmm6,%xmm6
        vpxor   %xmm1,%xmm4,%xmm7
        vpsrld  $19,%xmm0,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $13,%xmm0,%xmm2
        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7
        vpaddd  %xmm7,%xmm6,%xmm6
        vpsrld  $6,%xmm13,%xmm7
        vpslld  $26,%xmm13,%xmm2
        vmovdqu %xmm6,240-128(%rax)
        vpaddd  %xmm8,%xmm6,%xmm6

        vpsrld  $11,%xmm13,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7
        vpslld  $21,%xmm13,%xmm2
        vpaddd  96(%rbp),%xmm6,%xmm6
        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $25,%xmm13,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $7,%xmm13,%xmm2
        vpandn  %xmm15,%xmm13,%xmm0
        vpand   %xmm14,%xmm13,%xmm4

        vpxor   %xmm1,%xmm7,%xmm7

        vpsrld  $2,%xmm9,%xmm8
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $30,%xmm9,%xmm1
        vpxor   %xmm4,%xmm0,%xmm0
        vpxor   %xmm9,%xmm10,%xmm4

        vpxor   %xmm1,%xmm8,%xmm8
        vpaddd  %xmm7,%xmm6,%xmm6

        vpsrld  $13,%xmm9,%xmm1

        vpslld  $19,%xmm9,%xmm2
        vpaddd  %xmm0,%xmm6,%xmm6
        vpand   %xmm4,%xmm3,%xmm3

        vpxor   %xmm1,%xmm8,%xmm7

        vpsrld  $22,%xmm9,%xmm1
        vpxor   %xmm2,%xmm7,%xmm7

        vpslld  $10,%xmm9,%xmm2
        vpxor   %xmm3,%xmm10,%xmm8
        vpaddd  %xmm6,%xmm12,%xmm12

        vpxor   %xmm1,%xmm7,%xmm7
        vpxor   %xmm2,%xmm7,%xmm7

        vpaddd  %xmm6,%xmm8,%xmm8
        vpaddd  %xmm7,%xmm8,%xmm8
        addq    $256,%rbp
        decl    %ecx
        jnz     .Loop_16_xx_avx

        /*
         * Reached when the .Loop_16_xx_avx round loop above falls through.
         *
         * %rbp was advanced by 256 on every pass of that loop; it is reset
         * to K256+128 here.  (%rbx) holds four 32-bit per-lane counters.
         * Any lane whose counter is <= 1 (signed compare) has its pointer
         * register (%r8, %r9, %r10, %r11 respectively) replaced by %rbp.
         * NOTE(review): counters and pointers are initialised before this
         * chunk; presumably they are remaining-block counts and input
         * pointers, and the redirect gives an exhausted lane a valid
         * address to keep reading from - confirm at .Loop_grande_avx.
         */
        movl    $1,%ecx
        leaq    K256+128(%rip),%rbp
        cmpl    0(%rbx),%ecx
        cmovgeq %rbp,%r8
        cmpl    4(%rbx),%ecx
        cmovgeq %rbp,%r9
        cmpl    8(%rbx),%ecx
        cmovgeq %rbp,%r10
        cmpl    12(%rbx),%ecx
        cmovgeq %rbp,%r11
        /*
         * xmm7 = the four counters.
         * xmm6 = per-lane mask: all ones where counter > 0, zero otherwise.
         * Adding the mask (-1 or 0) decrements only the positive counters.
         */
        vmovdqa (%rbx),%xmm7
        vpxor   %xmm0,%xmm0,%xmm0
        vmovdqa %xmm7,%xmm6
        vpcmpgtd        %xmm0,%xmm6,%xmm6
        vpaddd  %xmm6,%xmm7,%xmm7

        /*
         * Merge xmm8..xmm15 into the eight rows stored at
         * -128(%rdi) + 32*k, k = 0..7 (xmm8 <-> row 0, ... xmm15 <-> row 7).
         * Each register is first ANDed with the mask in xmm6, so lanes whose
         * counter was <= 0 contribute zero and their stored value is written
         * back unchanged; the other lanes store (register + old value).
         * Loads, ANDs, adds and stores are interleaved; xmm0, xmm1, xmm2 and
         * xmm5 are used as temporaries for the old row values.
         */
        vmovdqu 0-128(%rdi),%xmm0
        vpand   %xmm6,%xmm8,%xmm8
        vmovdqu 32-128(%rdi),%xmm1
        vpand   %xmm6,%xmm9,%xmm9
        vmovdqu 64-128(%rdi),%xmm2
        vpand   %xmm6,%xmm10,%xmm10
        vmovdqu 96-128(%rdi),%xmm5
        vpand   %xmm6,%xmm11,%xmm11
        vpaddd  %xmm0,%xmm8,%xmm8
        vmovdqu 128-128(%rdi),%xmm0
        vpand   %xmm6,%xmm12,%xmm12
        vpaddd  %xmm1,%xmm9,%xmm9
        vmovdqu 160-128(%rdi),%xmm1
        vpand   %xmm6,%xmm13,%xmm13
        vpaddd  %xmm2,%xmm10,%xmm10
        vmovdqu 192-128(%rdi),%xmm2
        vpand   %xmm6,%xmm14,%xmm14
        vpaddd  %xmm5,%xmm11,%xmm11
        vmovdqu 224-128(%rdi),%xmm5
        vpand   %xmm6,%xmm15,%xmm15
        vpaddd  %xmm0,%xmm12,%xmm12
        vpaddd  %xmm1,%xmm13,%xmm13
        vmovdqu %xmm8,0-128(%rdi)
        vpaddd  %xmm2,%xmm14,%xmm14
        vmovdqu %xmm9,32-128(%rdi)
        vpaddd  %xmm5,%xmm15,%xmm15
        vmovdqu %xmm10,64-128(%rdi)
        vmovdqu %xmm11,96-128(%rdi)
        vmovdqu %xmm12,128-128(%rdi)
        vmovdqu %xmm13,160-128(%rdi)
        vmovdqu %xmm14,192-128(%rdi)
        vmovdqu %xmm15,224-128(%rdi)

        /*
         * Store the updated counters and reload .Lpbswap into xmm6, which
         * was overwritten by the mask above.  %edx is the .Loop_avx
         * iteration count; loop again while it is non-zero.
         */
        vmovdqu %xmm7,(%rbx)
        vmovdqu .Lpbswap(%rip),%xmm6
        decl    %edx
        jnz     .Loop_avx

        /*
         * Outer loop: reload %edx from 280(%rsp), advance %rdi by 16 bytes
         * and %rsi by 64 bytes, and repeat while the decremented value is
         * non-zero.
         * NOTE(review): 280(%rsp) is written outside this chunk; the
         * .Loop_grande path at the top of the file saves the incoming %edx
         * in the same slot, presumably this path does too.  64 bytes would
         * be four 16-byte descriptors and 16 bytes four 32-bit lanes per
         * row - confirm against .Loop_grande_avx.
         */
        movl    280(%rsp),%edx
        leaq    16(%rdi),%rdi
        leaq    64(%rsi),%rsi
        decl    %edx
        jnz     .Loop_grande_avx

/*
 * Exit path of sha256_multi_block_avx: reload the frame anchor, restore
 * %rbp and %rbx from just below it, switch %rsp back and return.
 * NOTE(review): 272(%rsp) is written by the prologue, which is outside
 * this chunk; presumably it holds the value %rsp had on entry (the CFI
 * below defines the CFA as that value + 8) - confirm.
 */
.Ldone_avx:
        movq    272(%rsp),%rax
.cfi_def_cfa    %rax,8
        /* zero the upper halves of the ymm registers before returning */
        vzeroupper
        /* saved registers live at -16 (%rbp) and -8 (%rbx) from the anchor */
        movq    -16(%rax),%rbp
.cfi_restore    %rbp
        movq    -8(%rax),%rbx
.cfi_restore    %rbx
        /* %rsp = %rax */
        leaq    (%rax),%rsp
.cfi_def_cfa_register   %rsp
.Lepilogue_avx:
        /* bytes f3 c3 encode "rep ret" */
        .byte   0xf3,0xc3
.cfi_endproc
.size   sha256_multi_block_avx,.-sha256_multi_block_avx
/*
 * sha256_multi_block_avx2 - entry point of the AVX2 (ymm, eight 32-bit
 * lanes) variant; the _avx2_shortcut label marks the same address.
 *
 * Incoming registers as used by the visible code:
 *   %rdi  base of the 32-byte state rows (biased by +128 below)
 *   %rsi  base of the 16-byte per-lane descriptors read at .Loop_grande_avx2
 *   %edx  outer-loop count, spilled to 552(%rsp) at .Loop_grande_avx2
 * NOTE(review): the C prototype is not visible in this chunk; confirm the
 * meaning of the arguments against the caller.
 *
 * Frame: %rax keeps the entry %rsp.  %rbx, %rbp and %r12-%r15 are pushed
 * (at -8 .. -48 from the entry %rsp), then %rsp is lowered by 576 bytes
 * and rounded down to a 256-byte boundary.  The entry %rsp is stored at
 * 544(%rsp) so it can be found again after the realignment.
 */
.type   sha256_multi_block_avx2,@function
.align  32
sha256_multi_block_avx2:
.cfi_startproc
_avx2_shortcut:
        movq    %rsp,%rax
.cfi_def_cfa_register   %rax
        pushq   %rbx
.cfi_offset     %rbx,-16
        pushq   %rbp
.cfi_offset     %rbp,-24
        pushq   %r12
.cfi_offset     %r12,-32
        pushq   %r13
.cfi_offset     %r13,-40
        pushq   %r14
.cfi_offset     %r14,-48
        pushq   %r15
.cfi_offset     %r15,-56
        subq    $576,%rsp
        andq    $-256,%rsp
        movq    %rax,544(%rsp)
/*
 * DW_CFA_def_cfa_expression, 6 bytes:
 *   DW_OP_breg7 (%rsp) +544, DW_OP_deref, DW_OP_plus_uconst 8
 * i.e. CFA = *(%rsp + 544) + 8, matching the store just above.
 */
.cfi_escape     0x0f,0x06,0x77,0xa0,0x04,0x06,0x23,0x08
.Lbody_avx2:
        /*
         * %rbp = K256 + 128 and %rdi += 128: later code addresses both the
         * constant table and the state rows with displacements relative to
         * these biased bases (e.g. -128(%rbp), 0-128(%rdi)).
         */
        leaq    K256+128(%rip),%rbp
        leaq    128(%rdi),%rdi

/*
 * Per-pass lane setup for the AVX2 path.
 *
 * The incoming %edx is saved at 552(%rsp) and %edx is then reused to
 * accumulate the signed maximum of the eight lane counts.  %rbx points at
 * 512(%rsp), an array of eight 32-bit counters.
 *
 * For lane i = 0..7 the descriptor at 16*i(%rsi) is read:
 *   +0  8-byte pointer -> %r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11
 *   +8  32-bit count   -> %ecx, copied to 4*i(%rbx)
 * %edx becomes max(%edx, count).  If count <= 0 (signed) the lane's
 * pointer is replaced by %rbp, which holds K256+128 at this point.
 * NOTE(review): presumably the pointer is the lane's input data and the
 * count its number of blocks - confirm against the caller.
 */
.Loop_grande_avx2:
        movl    %edx,552(%rsp)
        xorl    %edx,%edx
        leaq    512(%rsp),%rbx

        /* lane 0 */
        movq    0(%rsi),%r12

        movl    8(%rsi),%ecx
        cmpl    %edx,%ecx
        cmovgl  %ecx,%edx
        testl   %ecx,%ecx
        movl    %ecx,0(%rbx)
        cmovleq %rbp,%r12

        /* lane 1 */
        movq    16(%rsi),%r13

        movl    24(%rsi),%ecx
        cmpl    %edx,%ecx
        cmovgl  %ecx,%edx
        testl   %ecx,%ecx
        movl    %ecx,4(%rbx)
        cmovleq %rbp,%r13

        /* lane 2 */
        movq    32(%rsi),%r14

        movl    40(%rsi),%ecx
        cmpl    %edx,%ecx
        cmovgl  %ecx,%edx
        testl   %ecx,%ecx
        movl    %ecx,8(%rbx)
        cmovleq %rbp,%r14

        /* lane 3 */
        movq    48(%rsi),%r15

        movl    56(%rsi),%ecx
        cmpl    %edx,%ecx
        cmovgl  %ecx,%edx
        testl   %ecx,%ecx
        movl    %ecx,12(%rbx)
        cmovleq %rbp,%r15

        /* lane 4 */
        movq    64(%rsi),%r8

        movl    72(%rsi),%ecx
        cmpl    %edx,%ecx
        cmovgl  %ecx,%edx
        testl   %ecx,%ecx
        movl    %ecx,16(%rbx)
        cmovleq %rbp,%r8

        /* lane 5 */
        movq    80(%rsi),%r9

        movl    88(%rsi),%ecx
        cmpl    %edx,%ecx
        cmovgl  %ecx,%edx
        testl   %ecx,%ecx
        movl    %ecx,20(%rbx)
        cmovleq %rbp,%r9

        /* lane 6 */
        movq    96(%rsi),%r10

        movl    104(%rsi),%ecx
        cmpl    %edx,%ecx
        cmovgl  %ecx,%edx
        testl   %ecx,%ecx
        movl    %ecx,24(%rbx)
        cmovleq %rbp,%r10

        /* lane 7 */
        movq    112(%rsi),%r11

        movl    120(%rsi),%ecx
        cmpl    %edx,%ecx
        cmovgl  %ecx,%edx
        testl   %ecx,%ecx
        movl    %ecx,28(%rbx)
        cmovleq %rbp,%r11
        /*
         * Load the eight 32-byte state rows at -128(%rdi) + 32*k into
         * ymm8..ymm15 and ymm6 from .Lpbswap (used as the vpshufb control
         * in .Loop_avx2).  Interleaved with the loads, %rax is set to
         * %rsp+128 and %rbx is re-pointed to %rsp+384, so the counter
         * array written above is at 128(%rbx) from here on.
         * NOTE(review): unlike the .Loop_grande path at the top of the
         * file, %edx is not tested for zero before entering the round
         * loop in this section - confirm that is intended.
         */
        vmovdqu 0-128(%rdi),%ymm8
        leaq    128(%rsp),%rax
        vmovdqu 32-128(%rdi),%ymm9
        leaq    256+128(%rsp),%rbx
        vmovdqu 64-128(%rdi),%ymm10
        vmovdqu 96-128(%rdi),%ymm11
        vmovdqu 128-128(%rdi),%ymm12
        vmovdqu 160-128(%rdi),%ymm13
        vmovdqu 192-128(%rdi),%ymm14
        vmovdqu 224-128(%rdi),%ymm15
        vmovdqu .Lpbswap(%rip),%ymm6
        jmp     .Loop_avx2

.align  32
.Loop_avx2:
        vpxor   %ymm9,%ymm10,%ymm4
        vmovd   0(%r12),%xmm5
        vmovd   0(%r8),%xmm0
        vmovd   0(%r13),%xmm1
        vmovd   0(%r9),%xmm2
        vpinsrd $1,0(%r14),%xmm5,%xmm5
        vpinsrd $1,0(%r10),%xmm0,%xmm0
        vpinsrd $1,0(%r15),%xmm1,%xmm1
        vpunpckldq      %ymm1,%ymm5,%ymm5
        vpinsrd $1,0(%r11),%xmm2,%xmm2
        vpunpckldq      %ymm2,%ymm0,%ymm0
        vinserti128     $1,%xmm0,%ymm5,%ymm5
        vpshufb %ymm6,%ymm5,%ymm5
        vpsrld  $6,%ymm12,%ymm7
        vpslld  $26,%ymm12,%ymm2
        vmovdqu %ymm5,0-128(%rax)
        vpaddd  %ymm15,%ymm5,%ymm5

        vpsrld  $11,%ymm12,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm12,%ymm2
        vpaddd  -128(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm12,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm12,%ymm2
        vpandn  %ymm14,%ymm12,%ymm0
        vpand   %ymm13,%ymm12,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $2,%ymm8,%ymm15
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm8,%ymm1
        vpxor   %ymm3,%ymm0,%ymm0
        vpxor   %ymm8,%ymm9,%ymm3

        vpxor   %ymm1,%ymm15,%ymm15
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm8,%ymm1

        vpslld  $19,%ymm8,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        vpand   %ymm3,%ymm4,%ymm4

        vpxor   %ymm1,%ymm15,%ymm7

        vpsrld  $22,%ymm8,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm8,%ymm2
        vpxor   %ymm4,%ymm9,%ymm15
        vpaddd  %ymm5,%ymm11,%ymm11

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        vpaddd  %ymm5,%ymm15,%ymm15
        vpaddd  %ymm7,%ymm15,%ymm15
        vmovd   4(%r12),%xmm5
        vmovd   4(%r8),%xmm0
        vmovd   4(%r13),%xmm1
        vmovd   4(%r9),%xmm2
        vpinsrd $1,4(%r14),%xmm5,%xmm5
        vpinsrd $1,4(%r10),%xmm0,%xmm0
        vpinsrd $1,4(%r15),%xmm1,%xmm1
        vpunpckldq      %ymm1,%ymm5,%ymm5
        vpinsrd $1,4(%r11),%xmm2,%xmm2
        vpunpckldq      %ymm2,%ymm0,%ymm0
        vinserti128     $1,%xmm0,%ymm5,%ymm5
        vpshufb %ymm6,%ymm5,%ymm5
        vpsrld  $6,%ymm11,%ymm7
        vpslld  $26,%ymm11,%ymm2
        vmovdqu %ymm5,32-128(%rax)
        vpaddd  %ymm14,%ymm5,%ymm5

        vpsrld  $11,%ymm11,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm11,%ymm2
        vpaddd  -96(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm11,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm11,%ymm2
        vpandn  %ymm13,%ymm11,%ymm0
        vpand   %ymm12,%ymm11,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $2,%ymm15,%ymm14
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm15,%ymm1
        vpxor   %ymm4,%ymm0,%ymm0
        vpxor   %ymm15,%ymm8,%ymm4

        vpxor   %ymm1,%ymm14,%ymm14
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm15,%ymm1

        vpslld  $19,%ymm15,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        vpand   %ymm4,%ymm3,%ymm3

        vpxor   %ymm1,%ymm14,%ymm7

        vpsrld  $22,%ymm15,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm15,%ymm2
        vpxor   %ymm3,%ymm8,%ymm14
        vpaddd  %ymm5,%ymm10,%ymm10

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        vpaddd  %ymm5,%ymm14,%ymm14
        vpaddd  %ymm7,%ymm14,%ymm14
        vmovd   8(%r12),%xmm5
        vmovd   8(%r8),%xmm0
        vmovd   8(%r13),%xmm1
        vmovd   8(%r9),%xmm2
        vpinsrd $1,8(%r14),%xmm5,%xmm5
        vpinsrd $1,8(%r10),%xmm0,%xmm0
        vpinsrd $1,8(%r15),%xmm1,%xmm1
        vpunpckldq      %ymm1,%ymm5,%ymm5
        vpinsrd $1,8(%r11),%xmm2,%xmm2
        vpunpckldq      %ymm2,%ymm0,%ymm0
        vinserti128     $1,%xmm0,%ymm5,%ymm5
        vpshufb %ymm6,%ymm5,%ymm5
        vpsrld  $6,%ymm10,%ymm7
        vpslld  $26,%ymm10,%ymm2
        vmovdqu %ymm5,64-128(%rax)
        vpaddd  %ymm13,%ymm5,%ymm5

        vpsrld  $11,%ymm10,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm10,%ymm2
        vpaddd  -64(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm10,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm10,%ymm2
        vpandn  %ymm12,%ymm10,%ymm0
        vpand   %ymm11,%ymm10,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $2,%ymm14,%ymm13
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm14,%ymm1
        vpxor   %ymm3,%ymm0,%ymm0
        vpxor   %ymm14,%ymm15,%ymm3

        vpxor   %ymm1,%ymm13,%ymm13
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm14,%ymm1

        vpslld  $19,%ymm14,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        vpand   %ymm3,%ymm4,%ymm4

        vpxor   %ymm1,%ymm13,%ymm7

        vpsrld  $22,%ymm14,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm14,%ymm2
        vpxor   %ymm4,%ymm15,%ymm13
        vpaddd  %ymm5,%ymm9,%ymm9

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        vpaddd  %ymm5,%ymm13,%ymm13
        vpaddd  %ymm7,%ymm13,%ymm13
        vmovd   12(%r12),%xmm5
        vmovd   12(%r8),%xmm0
        vmovd   12(%r13),%xmm1
        vmovd   12(%r9),%xmm2
        vpinsrd $1,12(%r14),%xmm5,%xmm5
        vpinsrd $1,12(%r10),%xmm0,%xmm0
        vpinsrd $1,12(%r15),%xmm1,%xmm1
        vpunpckldq      %ymm1,%ymm5,%ymm5
        vpinsrd $1,12(%r11),%xmm2,%xmm2
        vpunpckldq      %ymm2,%ymm0,%ymm0
        vinserti128     $1,%xmm0,%ymm5,%ymm5
        vpshufb %ymm6,%ymm5,%ymm5
        vpsrld  $6,%ymm9,%ymm7
        vpslld  $26,%ymm9,%ymm2
        vmovdqu %ymm5,96-128(%rax)
        vpaddd  %ymm12,%ymm5,%ymm5

        vpsrld  $11,%ymm9,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm9,%ymm2
        vpaddd  -32(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm9,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm9,%ymm2
        vpandn  %ymm11,%ymm9,%ymm0
        vpand   %ymm10,%ymm9,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $2,%ymm13,%ymm12
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm13,%ymm1
        vpxor   %ymm4,%ymm0,%ymm0
        vpxor   %ymm13,%ymm14,%ymm4

        vpxor   %ymm1,%ymm12,%ymm12
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm13,%ymm1

        vpslld  $19,%ymm13,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        vpand   %ymm4,%ymm3,%ymm3

        vpxor   %ymm1,%ymm12,%ymm7

        vpsrld  $22,%ymm13,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm13,%ymm2
        vpxor   %ymm3,%ymm14,%ymm12
        vpaddd  %ymm5,%ymm8,%ymm8

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        vpaddd  %ymm5,%ymm12,%ymm12
        vpaddd  %ymm7,%ymm12,%ymm12
        vmovd   16(%r12),%xmm5
        vmovd   16(%r8),%xmm0
        vmovd   16(%r13),%xmm1
        vmovd   16(%r9),%xmm2
        vpinsrd $1,16(%r14),%xmm5,%xmm5
        vpinsrd $1,16(%r10),%xmm0,%xmm0
        vpinsrd $1,16(%r15),%xmm1,%xmm1
        vpunpckldq      %ymm1,%ymm5,%ymm5
        vpinsrd $1,16(%r11),%xmm2,%xmm2
        vpunpckldq      %ymm2,%ymm0,%ymm0
        vinserti128     $1,%xmm0,%ymm5,%ymm5
        vpshufb %ymm6,%ymm5,%ymm5
        vpsrld  $6,%ymm8,%ymm7
        vpslld  $26,%ymm8,%ymm2
        vmovdqu %ymm5,128-128(%rax)
        vpaddd  %ymm11,%ymm5,%ymm5

        vpsrld  $11,%ymm8,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm8,%ymm2
        vpaddd  0(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm8,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm8,%ymm2
        vpandn  %ymm10,%ymm8,%ymm0
        vpand   %ymm9,%ymm8,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $2,%ymm12,%ymm11
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm12,%ymm1
        vpxor   %ymm3,%ymm0,%ymm0
        vpxor   %ymm12,%ymm13,%ymm3

        vpxor   %ymm1,%ymm11,%ymm11
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm12,%ymm1

        vpslld  $19,%ymm12,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        vpand   %ymm3,%ymm4,%ymm4

        vpxor   %ymm1,%ymm11,%ymm7

        vpsrld  $22,%ymm12,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm12,%ymm2
        vpxor   %ymm4,%ymm13,%ymm11
        vpaddd  %ymm5,%ymm15,%ymm15

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        vpaddd  %ymm5,%ymm11,%ymm11
        vpaddd  %ymm7,%ymm11,%ymm11
        vmovd   20(%r12),%xmm5
        vmovd   20(%r8),%xmm0
        vmovd   20(%r13),%xmm1
        vmovd   20(%r9),%xmm2
        vpinsrd $1,20(%r14),%xmm5,%xmm5
        vpinsrd $1,20(%r10),%xmm0,%xmm0
        vpinsrd $1,20(%r15),%xmm1,%xmm1
        vpunpckldq      %ymm1,%ymm5,%ymm5
        vpinsrd $1,20(%r11),%xmm2,%xmm2
        vpunpckldq      %ymm2,%ymm0,%ymm0
        vinserti128     $1,%xmm0,%ymm5,%ymm5
        vpshufb %ymm6,%ymm5,%ymm5
        vpsrld  $6,%ymm15,%ymm7
        vpslld  $26,%ymm15,%ymm2
        vmovdqu %ymm5,160-128(%rax)
        vpaddd  %ymm10,%ymm5,%ymm5

        vpsrld  $11,%ymm15,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm15,%ymm2
        vpaddd  32(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm15,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm15,%ymm2
        vpandn  %ymm9,%ymm15,%ymm0
        vpand   %ymm8,%ymm15,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $2,%ymm11,%ymm10
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm11,%ymm1
        vpxor   %ymm4,%ymm0,%ymm0
        vpxor   %ymm11,%ymm12,%ymm4

        vpxor   %ymm1,%ymm10,%ymm10
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm11,%ymm1

        vpslld  $19,%ymm11,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        vpand   %ymm4,%ymm3,%ymm3

        vpxor   %ymm1,%ymm10,%ymm7

        vpsrld  $22,%ymm11,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm11,%ymm2
        vpxor   %ymm3,%ymm12,%ymm10
        vpaddd  %ymm5,%ymm14,%ymm14

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        vpaddd  %ymm5,%ymm10,%ymm10
        vpaddd  %ymm7,%ymm10,%ymm10
        vmovd   24(%r12),%xmm5
        vmovd   24(%r8),%xmm0
        vmovd   24(%r13),%xmm1
        vmovd   24(%r9),%xmm2
        vpinsrd $1,24(%r14),%xmm5,%xmm5
        vpinsrd $1,24(%r10),%xmm0,%xmm0
        vpinsrd $1,24(%r15),%xmm1,%xmm1
        vpunpckldq      %ymm1,%ymm5,%ymm5
        vpinsrd $1,24(%r11),%xmm2,%xmm2
        vpunpckldq      %ymm2,%ymm0,%ymm0
        vinserti128     $1,%xmm0,%ymm5,%ymm5
        vpshufb %ymm6,%ymm5,%ymm5
        vpsrld  $6,%ymm14,%ymm7
        vpslld  $26,%ymm14,%ymm2
        vmovdqu %ymm5,192-128(%rax)
        vpaddd  %ymm9,%ymm5,%ymm5

        vpsrld  $11,%ymm14,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm14,%ymm2
        vpaddd  64(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm14,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm14,%ymm2
        vpandn  %ymm8,%ymm14,%ymm0
        vpand   %ymm15,%ymm14,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $2,%ymm10,%ymm9
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm10,%ymm1
        vpxor   %ymm3,%ymm0,%ymm0
        vpxor   %ymm10,%ymm11,%ymm3

        vpxor   %ymm1,%ymm9,%ymm9
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm10,%ymm1

        vpslld  $19,%ymm10,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        vpand   %ymm3,%ymm4,%ymm4

        vpxor   %ymm1,%ymm9,%ymm7

        vpsrld  $22,%ymm10,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm10,%ymm2
        vpxor   %ymm4,%ymm11,%ymm9
        vpaddd  %ymm5,%ymm13,%ymm13

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        vpaddd  %ymm5,%ymm9,%ymm9
        vpaddd  %ymm7,%ymm9,%ymm9
        vmovd   28(%r12),%xmm5
        vmovd   28(%r8),%xmm0
        vmovd   28(%r13),%xmm1
        vmovd   28(%r9),%xmm2
        vpinsrd $1,28(%r14),%xmm5,%xmm5
        vpinsrd $1,28(%r10),%xmm0,%xmm0
        vpinsrd $1,28(%r15),%xmm1,%xmm1
        vpunpckldq      %ymm1,%ymm5,%ymm5
        vpinsrd $1,28(%r11),%xmm2,%xmm2
        vpunpckldq      %ymm2,%ymm0,%ymm0
        vinserti128     $1,%xmm0,%ymm5,%ymm5
        vpshufb %ymm6,%ymm5,%ymm5
        vpsrld  $6,%ymm13,%ymm7
        vpslld  $26,%ymm13,%ymm2
        vmovdqu %ymm5,224-128(%rax)
        vpaddd  %ymm8,%ymm5,%ymm5

        vpsrld  $11,%ymm13,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm13,%ymm2
        vpaddd  96(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm13,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm13,%ymm2
        vpandn  %ymm15,%ymm13,%ymm0
        vpand   %ymm14,%ymm13,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $2,%ymm9,%ymm8
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm9,%ymm1
        vpxor   %ymm4,%ymm0,%ymm0
        vpxor   %ymm9,%ymm10,%ymm4

        vpxor   %ymm1,%ymm8,%ymm8
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm9,%ymm1

        vpslld  $19,%ymm9,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        vpand   %ymm4,%ymm3,%ymm3

        vpxor   %ymm1,%ymm8,%ymm7

        vpsrld  $22,%ymm9,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm9,%ymm2
        vpxor   %ymm3,%ymm10,%ymm8
        vpaddd  %ymm5,%ymm12,%ymm12

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        vpaddd  %ymm5,%ymm8,%ymm8
        vpaddd  %ymm7,%ymm8,%ymm8
        addq    $256,%rbp
        # Load round for message byte offset 32 (word 8), eight lanes at once.
        # Role names follow the SHA-256 round variables these registers play
        # here (matched from the rotate amounts): a=ymm8 b=ymm9 d=ymm11
        # e=ymm12 f=ymm13 g=ymm14 h=ymm15; ymm5 accumulates the round sum.
        # Gather one dword per input pointer: low 128 bits receive
        # r12,r13,r14,r15 and high 128 bits receive r8,r9,r10,r11.
        vmovd   32(%r12),%xmm5
        vmovd   32(%r8),%xmm0
        vmovd   32(%r13),%xmm1
        vmovd   32(%r9),%xmm2
        vpinsrd $1,32(%r14),%xmm5,%xmm5
        vpinsrd $1,32(%r10),%xmm0,%xmm0
        vpinsrd $1,32(%r15),%xmm1,%xmm1
        vpunpckldq      %ymm1,%ymm5,%ymm5
        vpinsrd $1,32(%r11),%xmm2,%xmm2
        vpunpckldq      %ymm2,%ymm0,%ymm0
        vinserti128     $1,%xmm0,%ymm5,%ymm5
        # Byte-permute with the mask held in ymm6 (mask is set up outside this
        # view; presumably the big-endian byte swap -- confirm at its load).
        vpshufb %ymm6,%ymm5,%ymm5
        # ymm7 collects ror(e,6)^ror(e,11)^ror(e,25); each rotate is built from
        # a right/left shift pair using ymm1/ymm2 as temporaries.
        vpsrld  $6,%ymm12,%ymm7
        vpslld  $26,%ymm12,%ymm2
        # Keep the word in schedule slot 8, then start the sum: word + h, and
        # below + the 32-byte table entry at -128(%rbp).
        vmovdqu %ymm5,256-256-128(%rbx)
        vpaddd  %ymm15,%ymm5,%ymm5

        vpsrld  $11,%ymm12,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm12,%ymm2
        vpaddd  -128(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm12,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm12,%ymm2
        # ymm0 = (~e & g) ^ (e & f); the XOR into ymm0 happens a few lines down.
        vpandn  %ymm14,%ymm12,%ymm0
        vpand   %ymm13,%ymm12,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7

        # h has been consumed; ymm15 now starts ror(a,2)^ror(a,13)^ror(a,22),
        # which is finished in ymm7 once the e-rotates were added to ymm5.
        vpsrld  $2,%ymm8,%ymm15
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm8,%ymm1
        vpxor   %ymm3,%ymm0,%ymm0
        # ymm3 = a ^ b, left live for the following round.
        vpxor   %ymm8,%ymm9,%ymm3

        vpxor   %ymm1,%ymm15,%ymm15
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm8,%ymm1

        vpslld  $19,%ymm8,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        # ymm4 still holds the previous round's a^b, i.e. this round's b^c.
        vpand   %ymm3,%ymm4,%ymm4

        vpxor   %ymm1,%ymm15,%ymm7

        vpsrld  $22,%ymm8,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm8,%ymm2
        # ymm15 = b ^ ((a^b) & (b^c)), the bitwise majority of a,b,c;
        # d += ymm5 (h + word + table entry + rotates(e) + select).
        vpxor   %ymm4,%ymm9,%ymm15
        vpaddd  %ymm5,%ymm11,%ymm11

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        # ymm15 += ymm5 + rotates(a): this is the next round's a.
        vpaddd  %ymm5,%ymm15,%ymm15
        vpaddd  %ymm7,%ymm15,%ymm15
        # Load round for message byte offset 36 (word 9), eight lanes at once.
        # Role names follow the SHA-256 round variables these registers play
        # here (matched from the rotate amounts): a=ymm15 b=ymm8 d=ymm10
        # e=ymm11 f=ymm12 g=ymm13 h=ymm14; ymm5 accumulates the round sum.
        # Gather one dword per input pointer: low 128 bits receive
        # r12,r13,r14,r15 and high 128 bits receive r8,r9,r10,r11.
        vmovd   36(%r12),%xmm5
        vmovd   36(%r8),%xmm0
        vmovd   36(%r13),%xmm1
        vmovd   36(%r9),%xmm2
        vpinsrd $1,36(%r14),%xmm5,%xmm5
        vpinsrd $1,36(%r10),%xmm0,%xmm0
        vpinsrd $1,36(%r15),%xmm1,%xmm1
        vpunpckldq      %ymm1,%ymm5,%ymm5
        vpinsrd $1,36(%r11),%xmm2,%xmm2
        vpunpckldq      %ymm2,%ymm0,%ymm0
        vinserti128     $1,%xmm0,%ymm5,%ymm5
        # Byte-permute with the mask held in ymm6 (mask is set up outside this
        # view; presumably the big-endian byte swap -- confirm at its load).
        vpshufb %ymm6,%ymm5,%ymm5
        # ymm7 collects ror(e,6)^ror(e,11)^ror(e,25); each rotate is built from
        # a right/left shift pair using ymm1/ymm2 as temporaries.
        vpsrld  $6,%ymm11,%ymm7
        vpslld  $26,%ymm11,%ymm2
        # Keep the word in schedule slot 9, then start the sum: word + h, and
        # below + the 32-byte table entry at -96(%rbp).
        vmovdqu %ymm5,288-256-128(%rbx)
        vpaddd  %ymm14,%ymm5,%ymm5

        vpsrld  $11,%ymm11,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm11,%ymm2
        vpaddd  -96(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm11,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm11,%ymm2
        # ymm0 = (~e & g) ^ (e & f); the XOR into ymm0 happens a few lines down.
        vpandn  %ymm13,%ymm11,%ymm0
        vpand   %ymm12,%ymm11,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7

        # h has been consumed; ymm14 now starts ror(a,2)^ror(a,13)^ror(a,22),
        # which is finished in ymm7 once the e-rotates were added to ymm5.
        vpsrld  $2,%ymm15,%ymm14
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm15,%ymm1
        vpxor   %ymm4,%ymm0,%ymm0
        # ymm4 = a ^ b, left live for the following round.
        vpxor   %ymm15,%ymm8,%ymm4

        vpxor   %ymm1,%ymm14,%ymm14
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm15,%ymm1

        vpslld  $19,%ymm15,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        # ymm3 still holds the previous round's a^b, i.e. this round's b^c.
        vpand   %ymm4,%ymm3,%ymm3

        vpxor   %ymm1,%ymm14,%ymm7

        vpsrld  $22,%ymm15,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm15,%ymm2
        # ymm14 = b ^ ((a^b) & (b^c)), the bitwise majority of a,b,c;
        # d += ymm5 (h + word + table entry + rotates(e) + select).
        vpxor   %ymm3,%ymm8,%ymm14
        vpaddd  %ymm5,%ymm10,%ymm10

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        # ymm14 += ymm5 + rotates(a): this is the next round's a.
        vpaddd  %ymm5,%ymm14,%ymm14
        vpaddd  %ymm7,%ymm14,%ymm14
        # Load round for message byte offset 40 (word 10), eight lanes at once.
        # Role names follow the SHA-256 round variables these registers play
        # here (matched from the rotate amounts): a=ymm14 b=ymm15 d=ymm9
        # e=ymm10 f=ymm11 g=ymm12 h=ymm13; ymm5 accumulates the round sum.
        # Gather one dword per input pointer: low 128 bits receive
        # r12,r13,r14,r15 and high 128 bits receive r8,r9,r10,r11.
        vmovd   40(%r12),%xmm5
        vmovd   40(%r8),%xmm0
        vmovd   40(%r13),%xmm1
        vmovd   40(%r9),%xmm2
        vpinsrd $1,40(%r14),%xmm5,%xmm5
        vpinsrd $1,40(%r10),%xmm0,%xmm0
        vpinsrd $1,40(%r15),%xmm1,%xmm1
        vpunpckldq      %ymm1,%ymm5,%ymm5
        vpinsrd $1,40(%r11),%xmm2,%xmm2
        vpunpckldq      %ymm2,%ymm0,%ymm0
        vinserti128     $1,%xmm0,%ymm5,%ymm5
        # Byte-permute with the mask held in ymm6 (mask is set up outside this
        # view; presumably the big-endian byte swap -- confirm at its load).
        vpshufb %ymm6,%ymm5,%ymm5
        # ymm7 collects ror(e,6)^ror(e,11)^ror(e,25); each rotate is built from
        # a right/left shift pair using ymm1/ymm2 as temporaries.
        vpsrld  $6,%ymm10,%ymm7
        vpslld  $26,%ymm10,%ymm2
        # Keep the word in schedule slot 10, then start the sum: word + h, and
        # below + the 32-byte table entry at -64(%rbp).
        vmovdqu %ymm5,320-256-128(%rbx)
        vpaddd  %ymm13,%ymm5,%ymm5

        vpsrld  $11,%ymm10,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm10,%ymm2
        vpaddd  -64(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm10,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm10,%ymm2
        # ymm0 = (~e & g) ^ (e & f); the XOR into ymm0 happens a few lines down.
        vpandn  %ymm12,%ymm10,%ymm0
        vpand   %ymm11,%ymm10,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7

        # h has been consumed; ymm13 now starts ror(a,2)^ror(a,13)^ror(a,22),
        # which is finished in ymm7 once the e-rotates were added to ymm5.
        vpsrld  $2,%ymm14,%ymm13
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm14,%ymm1
        vpxor   %ymm3,%ymm0,%ymm0
        # ymm3 = a ^ b, left live for the following round.
        vpxor   %ymm14,%ymm15,%ymm3

        vpxor   %ymm1,%ymm13,%ymm13
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm14,%ymm1

        vpslld  $19,%ymm14,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        # ymm4 still holds the previous round's a^b, i.e. this round's b^c.
        vpand   %ymm3,%ymm4,%ymm4

        vpxor   %ymm1,%ymm13,%ymm7

        vpsrld  $22,%ymm14,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm14,%ymm2
        # ymm13 = b ^ ((a^b) & (b^c)), the bitwise majority of a,b,c;
        # d += ymm5 (h + word + table entry + rotates(e) + select).
        vpxor   %ymm4,%ymm15,%ymm13
        vpaddd  %ymm5,%ymm9,%ymm9

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        # ymm13 += ymm5 + rotates(a): this is the next round's a.
        vpaddd  %ymm5,%ymm13,%ymm13
        vpaddd  %ymm7,%ymm13,%ymm13
        # Load round for message byte offset 44 (word 11), eight lanes at once.
        # Role names follow the SHA-256 round variables these registers play
        # here (matched from the rotate amounts): a=ymm13 b=ymm14 d=ymm8
        # e=ymm9 f=ymm10 g=ymm11 h=ymm12; ymm5 accumulates the round sum.
        # Gather one dword per input pointer: low 128 bits receive
        # r12,r13,r14,r15 and high 128 bits receive r8,r9,r10,r11.
        vmovd   44(%r12),%xmm5
        vmovd   44(%r8),%xmm0
        vmovd   44(%r13),%xmm1
        vmovd   44(%r9),%xmm2
        vpinsrd $1,44(%r14),%xmm5,%xmm5
        vpinsrd $1,44(%r10),%xmm0,%xmm0
        vpinsrd $1,44(%r15),%xmm1,%xmm1
        vpunpckldq      %ymm1,%ymm5,%ymm5
        vpinsrd $1,44(%r11),%xmm2,%xmm2
        vpunpckldq      %ymm2,%ymm0,%ymm0
        vinserti128     $1,%xmm0,%ymm5,%ymm5
        # Byte-permute with the mask held in ymm6 (mask is set up outside this
        # view; presumably the big-endian byte swap -- confirm at its load).
        vpshufb %ymm6,%ymm5,%ymm5
        # ymm7 collects ror(e,6)^ror(e,11)^ror(e,25); each rotate is built from
        # a right/left shift pair using ymm1/ymm2 as temporaries.
        vpsrld  $6,%ymm9,%ymm7
        vpslld  $26,%ymm9,%ymm2
        # Keep the word in schedule slot 11, then start the sum: word + h, and
        # below + the 32-byte table entry at -32(%rbp).
        vmovdqu %ymm5,352-256-128(%rbx)
        vpaddd  %ymm12,%ymm5,%ymm5

        vpsrld  $11,%ymm9,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm9,%ymm2
        vpaddd  -32(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm9,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm9,%ymm2
        # ymm0 = (~e & g) ^ (e & f); the XOR into ymm0 happens a few lines down.
        vpandn  %ymm11,%ymm9,%ymm0
        vpand   %ymm10,%ymm9,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7

        # h has been consumed; ymm12 now starts ror(a,2)^ror(a,13)^ror(a,22),
        # which is finished in ymm7 once the e-rotates were added to ymm5.
        vpsrld  $2,%ymm13,%ymm12
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm13,%ymm1
        vpxor   %ymm4,%ymm0,%ymm0
        # ymm4 = a ^ b, left live for the following round.
        vpxor   %ymm13,%ymm14,%ymm4

        vpxor   %ymm1,%ymm12,%ymm12
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm13,%ymm1

        vpslld  $19,%ymm13,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        # ymm3 still holds the previous round's a^b, i.e. this round's b^c.
        vpand   %ymm4,%ymm3,%ymm3

        vpxor   %ymm1,%ymm12,%ymm7

        vpsrld  $22,%ymm13,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm13,%ymm2
        # ymm12 = b ^ ((a^b) & (b^c)), the bitwise majority of a,b,c;
        # d += ymm5 (h + word + table entry + rotates(e) + select).
        vpxor   %ymm3,%ymm14,%ymm12
        vpaddd  %ymm5,%ymm8,%ymm8

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        # ymm12 += ymm5 + rotates(a): this is the next round's a.
        vpaddd  %ymm5,%ymm12,%ymm12
        vpaddd  %ymm7,%ymm12,%ymm12
        # Load round for message byte offset 48 (word 12), eight lanes at once.
        # Role names follow the SHA-256 round variables these registers play
        # here (matched from the rotate amounts): a=ymm12 b=ymm13 d=ymm15
        # e=ymm8 f=ymm9 g=ymm10 h=ymm11; ymm5 accumulates the round sum.
        # Gather one dword per input pointer: low 128 bits receive
        # r12,r13,r14,r15 and high 128 bits receive r8,r9,r10,r11.
        vmovd   48(%r12),%xmm5
        vmovd   48(%r8),%xmm0
        vmovd   48(%r13),%xmm1
        vmovd   48(%r9),%xmm2
        vpinsrd $1,48(%r14),%xmm5,%xmm5
        vpinsrd $1,48(%r10),%xmm0,%xmm0
        vpinsrd $1,48(%r15),%xmm1,%xmm1
        vpunpckldq      %ymm1,%ymm5,%ymm5
        vpinsrd $1,48(%r11),%xmm2,%xmm2
        vpunpckldq      %ymm2,%ymm0,%ymm0
        vinserti128     $1,%xmm0,%ymm5,%ymm5
        # Byte-permute with the mask held in ymm6 (mask is set up outside this
        # view; presumably the big-endian byte swap -- confirm at its load).
        vpshufb %ymm6,%ymm5,%ymm5
        # ymm7 collects ror(e,6)^ror(e,11)^ror(e,25); each rotate is built from
        # a right/left shift pair using ymm1/ymm2 as temporaries.
        vpsrld  $6,%ymm8,%ymm7
        vpslld  $26,%ymm8,%ymm2
        # Keep the word in schedule slot 12, then start the sum: word + h, and
        # below + the 32-byte table entry at 0(%rbp).
        vmovdqu %ymm5,384-256-128(%rbx)
        vpaddd  %ymm11,%ymm5,%ymm5

        vpsrld  $11,%ymm8,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm8,%ymm2
        vpaddd  0(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm8,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm8,%ymm2
        # ymm0 = (~e & g) ^ (e & f); the XOR into ymm0 happens a few lines down.
        vpandn  %ymm10,%ymm8,%ymm0
        vpand   %ymm9,%ymm8,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7

        # h has been consumed; ymm11 now starts ror(a,2)^ror(a,13)^ror(a,22),
        # which is finished in ymm7 once the e-rotates were added to ymm5.
        vpsrld  $2,%ymm12,%ymm11
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm12,%ymm1
        vpxor   %ymm3,%ymm0,%ymm0
        # ymm3 = a ^ b, left live for the following round.
        vpxor   %ymm12,%ymm13,%ymm3

        vpxor   %ymm1,%ymm11,%ymm11
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm12,%ymm1

        vpslld  $19,%ymm12,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        # ymm4 still holds the previous round's a^b, i.e. this round's b^c.
        vpand   %ymm3,%ymm4,%ymm4

        vpxor   %ymm1,%ymm11,%ymm7

        vpsrld  $22,%ymm12,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm12,%ymm2
        # ymm11 = b ^ ((a^b) & (b^c)), the bitwise majority of a,b,c;
        # d += ymm5 (h + word + table entry + rotates(e) + select).
        vpxor   %ymm4,%ymm13,%ymm11
        vpaddd  %ymm5,%ymm15,%ymm15

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        # ymm11 += ymm5 + rotates(a): this is the next round's a.
        vpaddd  %ymm5,%ymm11,%ymm11
        vpaddd  %ymm7,%ymm11,%ymm11
        # Load round for message byte offset 52 (word 13), eight lanes at once.
        # Role names follow the SHA-256 round variables these registers play
        # here (matched from the rotate amounts): a=ymm11 b=ymm12 d=ymm14
        # e=ymm15 f=ymm8 g=ymm9 h=ymm10; ymm5 accumulates the round sum.
        # Gather one dword per input pointer: low 128 bits receive
        # r12,r13,r14,r15 and high 128 bits receive r8,r9,r10,r11.
        vmovd   52(%r12),%xmm5
        vmovd   52(%r8),%xmm0
        vmovd   52(%r13),%xmm1
        vmovd   52(%r9),%xmm2
        vpinsrd $1,52(%r14),%xmm5,%xmm5
        vpinsrd $1,52(%r10),%xmm0,%xmm0
        vpinsrd $1,52(%r15),%xmm1,%xmm1
        vpunpckldq      %ymm1,%ymm5,%ymm5
        vpinsrd $1,52(%r11),%xmm2,%xmm2
        vpunpckldq      %ymm2,%ymm0,%ymm0
        vinserti128     $1,%xmm0,%ymm5,%ymm5
        # Byte-permute with the mask held in ymm6 (mask is set up outside this
        # view; presumably the big-endian byte swap -- confirm at its load).
        vpshufb %ymm6,%ymm5,%ymm5
        # ymm7 collects ror(e,6)^ror(e,11)^ror(e,25); each rotate is built from
        # a right/left shift pair using ymm1/ymm2 as temporaries.
        vpsrld  $6,%ymm15,%ymm7
        vpslld  $26,%ymm15,%ymm2
        # Keep the word in schedule slot 13, then start the sum: word + h, and
        # below + the 32-byte table entry at 32(%rbp).
        vmovdqu %ymm5,416-256-128(%rbx)
        vpaddd  %ymm10,%ymm5,%ymm5

        vpsrld  $11,%ymm15,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm15,%ymm2
        vpaddd  32(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm15,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm15,%ymm2
        # ymm0 = (~e & g) ^ (e & f); the XOR into ymm0 happens a few lines down.
        vpandn  %ymm9,%ymm15,%ymm0
        vpand   %ymm8,%ymm15,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7

        # h has been consumed; ymm10 now starts ror(a,2)^ror(a,13)^ror(a,22),
        # which is finished in ymm7 once the e-rotates were added to ymm5.
        vpsrld  $2,%ymm11,%ymm10
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm11,%ymm1
        vpxor   %ymm4,%ymm0,%ymm0
        # ymm4 = a ^ b, left live for the following round.
        vpxor   %ymm11,%ymm12,%ymm4

        vpxor   %ymm1,%ymm10,%ymm10
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm11,%ymm1

        vpslld  $19,%ymm11,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        # ymm3 still holds the previous round's a^b, i.e. this round's b^c.
        vpand   %ymm4,%ymm3,%ymm3

        vpxor   %ymm1,%ymm10,%ymm7

        vpsrld  $22,%ymm11,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm11,%ymm2
        # ymm10 = b ^ ((a^b) & (b^c)), the bitwise majority of a,b,c;
        # d += ymm5 (h + word + table entry + rotates(e) + select).
        vpxor   %ymm3,%ymm12,%ymm10
        vpaddd  %ymm5,%ymm14,%ymm14

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        # ymm10 += ymm5 + rotates(a): this is the next round's a.
        vpaddd  %ymm5,%ymm10,%ymm10
        vpaddd  %ymm7,%ymm10,%ymm10
        # Load round for message byte offset 56 (word 14), eight lanes at once.
        # Role names follow the SHA-256 round variables these registers play
        # here (matched from the rotate amounts): a=ymm10 b=ymm11 d=ymm13
        # e=ymm14 f=ymm15 g=ymm8 h=ymm9; ymm5 accumulates the round sum.
        # Gather one dword per input pointer: low 128 bits receive
        # r12,r13,r14,r15 and high 128 bits receive r8,r9,r10,r11.
        vmovd   56(%r12),%xmm5
        vmovd   56(%r8),%xmm0
        vmovd   56(%r13),%xmm1
        vmovd   56(%r9),%xmm2
        vpinsrd $1,56(%r14),%xmm5,%xmm5
        vpinsrd $1,56(%r10),%xmm0,%xmm0
        vpinsrd $1,56(%r15),%xmm1,%xmm1
        vpunpckldq      %ymm1,%ymm5,%ymm5
        vpinsrd $1,56(%r11),%xmm2,%xmm2
        vpunpckldq      %ymm2,%ymm0,%ymm0
        vinserti128     $1,%xmm0,%ymm5,%ymm5
        # Byte-permute with the mask held in ymm6 (mask is set up outside this
        # view; presumably the big-endian byte swap -- confirm at its load).
        vpshufb %ymm6,%ymm5,%ymm5
        # ymm7 collects ror(e,6)^ror(e,11)^ror(e,25); each rotate is built from
        # a right/left shift pair using ymm1/ymm2 as temporaries.
        vpsrld  $6,%ymm14,%ymm7
        vpslld  $26,%ymm14,%ymm2
        # Keep the word in schedule slot 14, then start the sum: word + h, and
        # below + the 32-byte table entry at 64(%rbp).
        vmovdqu %ymm5,448-256-128(%rbx)
        vpaddd  %ymm9,%ymm5,%ymm5

        vpsrld  $11,%ymm14,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm14,%ymm2
        vpaddd  64(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm14,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm14,%ymm2
        # ymm0 = (~e & g) ^ (e & f); the XOR into ymm0 happens a few lines down.
        vpandn  %ymm8,%ymm14,%ymm0
        vpand   %ymm15,%ymm14,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7

        # h has been consumed; ymm9 now starts ror(a,2)^ror(a,13)^ror(a,22),
        # which is finished in ymm7 once the e-rotates were added to ymm5.
        vpsrld  $2,%ymm10,%ymm9
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm10,%ymm1
        vpxor   %ymm3,%ymm0,%ymm0
        # ymm3 = a ^ b, left live for the following round.
        vpxor   %ymm10,%ymm11,%ymm3

        vpxor   %ymm1,%ymm9,%ymm9
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm10,%ymm1

        vpslld  $19,%ymm10,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        # ymm4 still holds the previous round's a^b, i.e. this round's b^c.
        vpand   %ymm3,%ymm4,%ymm4

        vpxor   %ymm1,%ymm9,%ymm7

        vpsrld  $22,%ymm10,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm10,%ymm2
        # ymm9 = b ^ ((a^b) & (b^c)), the bitwise majority of a,b,c;
        # d += ymm5 (h + word + table entry + rotates(e) + select).
        vpxor   %ymm4,%ymm11,%ymm9
        vpaddd  %ymm5,%ymm13,%ymm13

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        # ymm9 += ymm5 + rotates(a): this is the next round's a.
        vpaddd  %ymm5,%ymm9,%ymm9
        vpaddd  %ymm7,%ymm9,%ymm9
        # Load round for message byte offset 60 (word 15), eight lanes at once.
        # Role names follow the SHA-256 round variables these registers play
        # here (matched from the rotate amounts): a=ymm9 b=ymm10 d=ymm12
        # e=ymm13 f=ymm14 g=ymm15 h=ymm8; ymm5 accumulates the round sum.
        # Gather one dword per input pointer (low 128 bits: r12,r13,r14,r15;
        # high 128 bits: r8,r9,r10,r11).  Each pointer is advanced by 64 bytes
        # right after its last dword of this chunk has been read.
        vmovd   60(%r12),%xmm5
        leaq    64(%r12),%r12
        vmovd   60(%r8),%xmm0
        leaq    64(%r8),%r8
        vmovd   60(%r13),%xmm1
        leaq    64(%r13),%r13
        vmovd   60(%r9),%xmm2
        leaq    64(%r9),%r9
        vpinsrd $1,60(%r14),%xmm5,%xmm5
        leaq    64(%r14),%r14
        vpinsrd $1,60(%r10),%xmm0,%xmm0
        leaq    64(%r10),%r10
        vpinsrd $1,60(%r15),%xmm1,%xmm1
        leaq    64(%r15),%r15
        vpunpckldq      %ymm1,%ymm5,%ymm5
        vpinsrd $1,60(%r11),%xmm2,%xmm2
        leaq    64(%r11),%r11
        vpunpckldq      %ymm2,%ymm0,%ymm0
        vinserti128     $1,%xmm0,%ymm5,%ymm5
        # Byte-permute with the mask held in ymm6 (mask is set up outside this
        # view; presumably the big-endian byte swap -- confirm at its load).
        vpshufb %ymm6,%ymm5,%ymm5
        # ymm7 collects ror(e,6)^ror(e,11)^ror(e,25); each rotate is built from
        # a right/left shift pair using ymm1/ymm2 as temporaries.
        vpsrld  $6,%ymm13,%ymm7
        vpslld  $26,%ymm13,%ymm2
        # Keep the word in schedule slot 15, then start the sum: word + h, and
        # below + the 32-byte table entry at 96(%rbp).
        vmovdqu %ymm5,480-256-128(%rbx)
        vpaddd  %ymm8,%ymm5,%ymm5

        vpsrld  $11,%ymm13,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm13,%ymm2
        vpaddd  96(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm13,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        # The prefetcht0 lines interleaved below touch byte 63 past each of the
        # eight already-advanced input pointers, i.e. the last byte of the next
        # 64-byte chunk of every lane.
        prefetcht0      63(%r12)
        vpslld  $7,%ymm13,%ymm2
        # ymm0 = (~e & g) ^ (e & f); the XOR into ymm0 happens a few lines down.
        vpandn  %ymm15,%ymm13,%ymm0
        vpand   %ymm14,%ymm13,%ymm4
        prefetcht0      63(%r13)
        vpxor   %ymm1,%ymm7,%ymm7

        # h has been consumed; ymm8 now starts ror(a,2)^ror(a,13)^ror(a,22),
        # which is finished in ymm7 once the e-rotates were added to ymm5.
        vpsrld  $2,%ymm9,%ymm8
        vpxor   %ymm2,%ymm7,%ymm7
        prefetcht0      63(%r14)
        vpslld  $30,%ymm9,%ymm1
        vpxor   %ymm4,%ymm0,%ymm0
        # ymm4 = a ^ b, left live for the following round.
        vpxor   %ymm9,%ymm10,%ymm4
        prefetcht0      63(%r15)
        vpxor   %ymm1,%ymm8,%ymm8
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm9,%ymm1
        prefetcht0      63(%r8)
        vpslld  $19,%ymm9,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        # ymm3 still holds the previous round's a^b, i.e. this round's b^c.
        vpand   %ymm4,%ymm3,%ymm3
        prefetcht0      63(%r9)
        vpxor   %ymm1,%ymm8,%ymm7

        vpsrld  $22,%ymm9,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        prefetcht0      63(%r10)
        vpslld  $10,%ymm9,%ymm2
        # ymm8 = b ^ ((a^b) & (b^c)), the bitwise majority of a,b,c;
        # d += ymm5 (h + word + table entry + rotates(e) + select).
        vpxor   %ymm3,%ymm10,%ymm8
        vpaddd  %ymm5,%ymm12,%ymm12
        prefetcht0      63(%r11)
        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        # ymm8 += ymm5 + rotates(a): this is the next round's a.
        vpaddd  %ymm5,%ymm8,%ymm8
        vpaddd  %ymm7,%ymm8,%ymm8
        # Step the table pointer past the eight 32-byte entries just used
        # (displacements -128..96), so the next eight rounds reuse the same
        # displacements.
        addq    $256,%rbp
        # Preload schedule slot 0 into ymm5; the loop body expects it there.
        vmovdqu 0-128(%rax),%ymm5
        # ecx = 3 is not consumed inside this view; presumably the pass counter
        # of the loop entered below -- confirm at the loop tail.
        movl    $3,%ecx
        jmp     .Loop_16_xx_avx2
.align  32
.Loop_16_xx_avx2:
        # Schedule round for slot 0 of the 16-slot word ring.  Slot N lives at
        # N*32-128(%rax) for N=0..7 and at N*32-256-128(%rbx) for N=8..15.
        # ymm5 arrives holding slot 0.  New slot 0 = slot 0 + slot 9
        # + s0(slot 1) + s1(slot 14), with s0(x) = x>>3 ^ ror(x,7) ^ ror(x,18)
        # and s1(x) = x>>10 ^ ror(x,17) ^ ror(x,19).
        # Round roles (SHA-256 variable names, matched from the rotate
        # amounts): a=ymm8 b=ymm9 d=ymm11 e=ymm12 f=ymm13 g=ymm14 h=ymm15.
        # Slot 1 goes to ymm6: s0 input now, starting word of the next round.
        vmovdqu 32-128(%rax),%ymm6
        vpaddd  288-256-128(%rbx),%ymm5,%ymm5

        # ymm7 = s0(slot 1), added into ymm5 below.
        vpsrld  $3,%ymm6,%ymm7
        vpsrld  $7,%ymm6,%ymm1
        vpslld  $25,%ymm6,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $18,%ymm6,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $14,%ymm6,%ymm2
        # ymm0 = slot 14; ymm7 is rebuilt as s1(slot 14) after the first add.
        vmovdqu 448-256-128(%rbx),%ymm0
        vpsrld  $10,%ymm0,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $17,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $15,%ymm0,%ymm2
        vpaddd  %ymm7,%ymm5,%ymm5
        vpxor   %ymm1,%ymm3,%ymm7
        vpsrld  $19,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $13,%ymm0,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7
        vpaddd  %ymm7,%ymm5,%ymm5
        # Round proper.  ymm7 collects ror(e,6)^ror(e,11)^ror(e,25).
        vpsrld  $6,%ymm12,%ymm7
        vpslld  $26,%ymm12,%ymm2
        # Write the new word back to slot 0, then ymm5 = word + h, and below
        # + the 32-byte table entry at -128(%rbp).
        vmovdqu %ymm5,0-128(%rax)
        vpaddd  %ymm15,%ymm5,%ymm5

        vpsrld  $11,%ymm12,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm12,%ymm2
        vpaddd  -128(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm12,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm12,%ymm2
        # ymm0 = (~e & g) ^ (e & f); the XOR into ymm0 happens a few lines down.
        vpandn  %ymm14,%ymm12,%ymm0
        vpand   %ymm13,%ymm12,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7

        # h has been consumed; ymm15 now starts ror(a,2)^ror(a,13)^ror(a,22),
        # which is finished in ymm7.
        vpsrld  $2,%ymm8,%ymm15
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm8,%ymm1
        vpxor   %ymm3,%ymm0,%ymm0
        # ymm3 = a ^ b, left live for the following round.
        vpxor   %ymm8,%ymm9,%ymm3

        vpxor   %ymm1,%ymm15,%ymm15
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm8,%ymm1

        vpslld  $19,%ymm8,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        # ymm4 holds the a^b left by the round executed just before this one,
        # i.e. this round's b^c.
        vpand   %ymm3,%ymm4,%ymm4

        vpxor   %ymm1,%ymm15,%ymm7

        vpsrld  $22,%ymm8,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm8,%ymm2
        # ymm15 = b ^ ((a^b) & (b^c)), the bitwise majority of a,b,c;
        # d += ymm5.
        vpxor   %ymm4,%ymm9,%ymm15
        vpaddd  %ymm5,%ymm11,%ymm11

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        # ymm15 += ymm5 + rotates(a): this is the next round's a.
        vpaddd  %ymm5,%ymm15,%ymm15
        vpaddd  %ymm7,%ymm15,%ymm15
        # Schedule round for slot 1 of the 16-slot word ring.  Slot N lives at
        # N*32-128(%rax) for N=0..7 and at N*32-256-128(%rbx) for N=8..15.
        # ymm6 arrives holding slot 1.  New slot 1 = slot 1 + slot 10
        # + s0(slot 2) + s1(slot 15), with s0(x) = x>>3 ^ ror(x,7) ^ ror(x,18)
        # and s1(x) = x>>10 ^ ror(x,17) ^ ror(x,19).
        # Round roles (SHA-256 variable names, matched from the rotate
        # amounts): a=ymm15 b=ymm8 d=ymm10 e=ymm11 f=ymm12 g=ymm13 h=ymm14.
        # Slot 2 goes to ymm5: s0 input now, starting word of the next round.
        vmovdqu 64-128(%rax),%ymm5
        vpaddd  320-256-128(%rbx),%ymm6,%ymm6

        # ymm7 = s0(slot 2), added into ymm6 below.
        vpsrld  $3,%ymm5,%ymm7
        vpsrld  $7,%ymm5,%ymm1
        vpslld  $25,%ymm5,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $18,%ymm5,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $14,%ymm5,%ymm2
        # ymm0 = slot 15; ymm7 is rebuilt as s1(slot 15) after the first add.
        vmovdqu 480-256-128(%rbx),%ymm0
        vpsrld  $10,%ymm0,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $17,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $15,%ymm0,%ymm2
        vpaddd  %ymm7,%ymm6,%ymm6
        vpxor   %ymm1,%ymm4,%ymm7
        vpsrld  $19,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $13,%ymm0,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7
        vpaddd  %ymm7,%ymm6,%ymm6
        # Round proper.  ymm7 collects ror(e,6)^ror(e,11)^ror(e,25).
        vpsrld  $6,%ymm11,%ymm7
        vpslld  $26,%ymm11,%ymm2
        # Write the new word back to slot 1, then ymm6 = word + h, and below
        # + the 32-byte table entry at -96(%rbp).
        vmovdqu %ymm6,32-128(%rax)
        vpaddd  %ymm14,%ymm6,%ymm6

        vpsrld  $11,%ymm11,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm11,%ymm2
        vpaddd  -96(%rbp),%ymm6,%ymm6
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm11,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm11,%ymm2
        # ymm0 = (~e & g) ^ (e & f); the XOR into ymm0 happens a few lines down.
        vpandn  %ymm13,%ymm11,%ymm0
        vpand   %ymm12,%ymm11,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7

        # h has been consumed; ymm14 now starts ror(a,2)^ror(a,13)^ror(a,22),
        # which is finished in ymm7.
        vpsrld  $2,%ymm15,%ymm14
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm15,%ymm1
        vpxor   %ymm4,%ymm0,%ymm0
        # ymm4 = a ^ b, left live for the following round.
        vpxor   %ymm15,%ymm8,%ymm4

        vpxor   %ymm1,%ymm14,%ymm14
        vpaddd  %ymm7,%ymm6,%ymm6

        vpsrld  $13,%ymm15,%ymm1

        vpslld  $19,%ymm15,%ymm2
        vpaddd  %ymm0,%ymm6,%ymm6
        # ymm3 still holds the previous round's a^b, i.e. this round's b^c.
        vpand   %ymm4,%ymm3,%ymm3

        vpxor   %ymm1,%ymm14,%ymm7

        vpsrld  $22,%ymm15,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm15,%ymm2
        # ymm14 = b ^ ((a^b) & (b^c)), the bitwise majority of a,b,c;
        # d += ymm6.
        vpxor   %ymm3,%ymm8,%ymm14
        vpaddd  %ymm6,%ymm10,%ymm10

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        # ymm14 += ymm6 + rotates(a): this is the next round's a.
        vpaddd  %ymm6,%ymm14,%ymm14
        vpaddd  %ymm7,%ymm14,%ymm14
        # Schedule round for slot 2 of the 16-slot word ring.  Slot N lives at
        # N*32-128(%rax) for N=0..7 and at N*32-256-128(%rbx) for N=8..15.
        # ymm5 arrives holding slot 2.  New slot 2 = slot 2 + slot 11
        # + s0(slot 3) + s1(slot 0), with s0(x) = x>>3 ^ ror(x,7) ^ ror(x,18)
        # and s1(x) = x>>10 ^ ror(x,17) ^ ror(x,19).
        # Round roles (SHA-256 variable names, matched from the rotate
        # amounts): a=ymm14 b=ymm15 d=ymm9 e=ymm10 f=ymm11 g=ymm12 h=ymm13.
        # Slot 3 goes to ymm6: s0 input now, starting word of the next round.
        vmovdqu 96-128(%rax),%ymm6
        vpaddd  352-256-128(%rbx),%ymm5,%ymm5

        # ymm7 = s0(slot 3), added into ymm5 below.
        vpsrld  $3,%ymm6,%ymm7
        vpsrld  $7,%ymm6,%ymm1
        vpslld  $25,%ymm6,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $18,%ymm6,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $14,%ymm6,%ymm2
        # ymm0 = slot 0 (already rewritten two rounds ago); ymm7 is rebuilt
        # as s1(slot 0) after the first add.
        vmovdqu 0-128(%rax),%ymm0
        vpsrld  $10,%ymm0,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $17,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $15,%ymm0,%ymm2
        vpaddd  %ymm7,%ymm5,%ymm5
        vpxor   %ymm1,%ymm3,%ymm7
        vpsrld  $19,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $13,%ymm0,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7
        vpaddd  %ymm7,%ymm5,%ymm5
        # Round proper.  ymm7 collects ror(e,6)^ror(e,11)^ror(e,25).
        vpsrld  $6,%ymm10,%ymm7
        vpslld  $26,%ymm10,%ymm2
        # Write the new word back to slot 2, then ymm5 = word + h, and below
        # + the 32-byte table entry at -64(%rbp).
        vmovdqu %ymm5,64-128(%rax)
        vpaddd  %ymm13,%ymm5,%ymm5

        vpsrld  $11,%ymm10,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm10,%ymm2
        vpaddd  -64(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm10,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm10,%ymm2
        # ymm0 = (~e & g) ^ (e & f); the XOR into ymm0 happens a few lines down.
        vpandn  %ymm12,%ymm10,%ymm0
        vpand   %ymm11,%ymm10,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7

        # h has been consumed; ymm13 now starts ror(a,2)^ror(a,13)^ror(a,22),
        # which is finished in ymm7.
        vpsrld  $2,%ymm14,%ymm13
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm14,%ymm1
        vpxor   %ymm3,%ymm0,%ymm0
        # ymm3 = a ^ b, left live for the following round.
        vpxor   %ymm14,%ymm15,%ymm3

        vpxor   %ymm1,%ymm13,%ymm13
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm14,%ymm1

        vpslld  $19,%ymm14,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        # ymm4 still holds the previous round's a^b, i.e. this round's b^c.
        vpand   %ymm3,%ymm4,%ymm4

        vpxor   %ymm1,%ymm13,%ymm7

        vpsrld  $22,%ymm14,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm14,%ymm2
        # ymm13 = b ^ ((a^b) & (b^c)), the bitwise majority of a,b,c;
        # d += ymm5.
        vpxor   %ymm4,%ymm15,%ymm13
        vpaddd  %ymm5,%ymm9,%ymm9

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        # ymm13 += ymm5 + rotates(a): this is the next round's a.
        vpaddd  %ymm5,%ymm13,%ymm13
        vpaddd  %ymm7,%ymm13,%ymm13
        # Schedule round for slot 3 of the 16-slot word ring.  Slot N lives at
        # N*32-128(%rax) for N=0..7 and at N*32-256-128(%rbx) for N=8..15.
        # ymm6 arrives holding slot 3.  New slot 3 = slot 3 + slot 12
        # + s0(slot 4) + s1(slot 1), with s0(x) = x>>3 ^ ror(x,7) ^ ror(x,18)
        # and s1(x) = x>>10 ^ ror(x,17) ^ ror(x,19).
        # Round roles (SHA-256 variable names, matched from the rotate
        # amounts): a=ymm13 b=ymm14 d=ymm8 e=ymm9 f=ymm10 g=ymm11 h=ymm12.
        # Slot 4 goes to ymm5: s0 input now, starting word of the next round.
        vmovdqu 128-128(%rax),%ymm5
        vpaddd  384-256-128(%rbx),%ymm6,%ymm6

        # ymm7 = s0(slot 4), added into ymm6 below.
        vpsrld  $3,%ymm5,%ymm7
        vpsrld  $7,%ymm5,%ymm1
        vpslld  $25,%ymm5,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $18,%ymm5,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $14,%ymm5,%ymm2
        # ymm0 = slot 1 (already rewritten two rounds ago); ymm7 is rebuilt
        # as s1(slot 1) after the first add.
        vmovdqu 32-128(%rax),%ymm0
        vpsrld  $10,%ymm0,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $17,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $15,%ymm0,%ymm2
        vpaddd  %ymm7,%ymm6,%ymm6
        vpxor   %ymm1,%ymm4,%ymm7
        vpsrld  $19,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $13,%ymm0,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7
        vpaddd  %ymm7,%ymm6,%ymm6
        # Round proper.  ymm7 collects ror(e,6)^ror(e,11)^ror(e,25).
        vpsrld  $6,%ymm9,%ymm7
        vpslld  $26,%ymm9,%ymm2
        # Write the new word back to slot 3, then ymm6 = word + h, and below
        # + the 32-byte table entry at -32(%rbp).
        vmovdqu %ymm6,96-128(%rax)
        vpaddd  %ymm12,%ymm6,%ymm6

        vpsrld  $11,%ymm9,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm9,%ymm2
        vpaddd  -32(%rbp),%ymm6,%ymm6
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm9,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm9,%ymm2
        # ymm0 = (~e & g) ^ (e & f); the XOR into ymm0 happens a few lines down.
        vpandn  %ymm11,%ymm9,%ymm0
        vpand   %ymm10,%ymm9,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7

        # h has been consumed; ymm12 now starts ror(a,2)^ror(a,13)^ror(a,22),
        # which is finished in ymm7.
        vpsrld  $2,%ymm13,%ymm12
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm13,%ymm1
        vpxor   %ymm4,%ymm0,%ymm0
        # ymm4 = a ^ b, left live for the following round.
        vpxor   %ymm13,%ymm14,%ymm4

        vpxor   %ymm1,%ymm12,%ymm12
        vpaddd  %ymm7,%ymm6,%ymm6

        vpsrld  $13,%ymm13,%ymm1

        vpslld  $19,%ymm13,%ymm2
        vpaddd  %ymm0,%ymm6,%ymm6
        # ymm3 still holds the previous round's a^b, i.e. this round's b^c.
        vpand   %ymm4,%ymm3,%ymm3

        vpxor   %ymm1,%ymm12,%ymm7

        vpsrld  $22,%ymm13,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm13,%ymm2
        # ymm12 = b ^ ((a^b) & (b^c)), the bitwise majority of a,b,c;
        # d += ymm6.
        vpxor   %ymm3,%ymm14,%ymm12
        vpaddd  %ymm6,%ymm8,%ymm8

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        # ymm12 += ymm6 + rotates(a): this is the next round's a.
        vpaddd  %ymm6,%ymm12,%ymm12
        vpaddd  %ymm7,%ymm12,%ymm12
        # Schedule round for slot 4 of the 16-slot word ring.  Slot N lives at
        # N*32-128(%rax) for N=0..7 and at N*32-256-128(%rbx) for N=8..15.
        # ymm5 arrives holding slot 4.  New slot 4 = slot 4 + slot 13
        # + s0(slot 5) + s1(slot 2), with s0(x) = x>>3 ^ ror(x,7) ^ ror(x,18)
        # and s1(x) = x>>10 ^ ror(x,17) ^ ror(x,19).
        # Round roles (SHA-256 variable names, matched from the rotate
        # amounts): a=ymm12 b=ymm13 d=ymm15 e=ymm8 f=ymm9 g=ymm10 h=ymm11.
        # Slot 5 goes to ymm6: s0 input now, starting word of the next round.
        vmovdqu 160-128(%rax),%ymm6
        vpaddd  416-256-128(%rbx),%ymm5,%ymm5

        # ymm7 = s0(slot 5), added into ymm5 below.
        vpsrld  $3,%ymm6,%ymm7
        vpsrld  $7,%ymm6,%ymm1
        vpslld  $25,%ymm6,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $18,%ymm6,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $14,%ymm6,%ymm2
        # ymm0 = slot 2 (already rewritten two rounds ago); ymm7 is rebuilt
        # as s1(slot 2) after the first add.
        vmovdqu 64-128(%rax),%ymm0
        vpsrld  $10,%ymm0,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $17,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $15,%ymm0,%ymm2
        vpaddd  %ymm7,%ymm5,%ymm5
        vpxor   %ymm1,%ymm3,%ymm7
        vpsrld  $19,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $13,%ymm0,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7
        vpaddd  %ymm7,%ymm5,%ymm5
        # Round proper.  ymm7 collects ror(e,6)^ror(e,11)^ror(e,25).
        vpsrld  $6,%ymm8,%ymm7
        vpslld  $26,%ymm8,%ymm2
        # Write the new word back to slot 4, then ymm5 = word + h, and below
        # + the 32-byte table entry at 0(%rbp).
        vmovdqu %ymm5,128-128(%rax)
        vpaddd  %ymm11,%ymm5,%ymm5

        vpsrld  $11,%ymm8,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm8,%ymm2
        vpaddd  0(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm8,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm8,%ymm2
        # ymm0 = (~e & g) ^ (e & f); the XOR into ymm0 happens a few lines down.
        vpandn  %ymm10,%ymm8,%ymm0
        vpand   %ymm9,%ymm8,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7

        # h has been consumed; ymm11 now starts ror(a,2)^ror(a,13)^ror(a,22),
        # which is finished in ymm7.
        vpsrld  $2,%ymm12,%ymm11
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm12,%ymm1
        vpxor   %ymm3,%ymm0,%ymm0
        # ymm3 = a ^ b, left live for the following round.
        vpxor   %ymm12,%ymm13,%ymm3

        vpxor   %ymm1,%ymm11,%ymm11
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm12,%ymm1

        vpslld  $19,%ymm12,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        # ymm4 still holds the previous round's a^b, i.e. this round's b^c.
        vpand   %ymm3,%ymm4,%ymm4

        vpxor   %ymm1,%ymm11,%ymm7

        vpsrld  $22,%ymm12,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm12,%ymm2
        # ymm11 = b ^ ((a^b) & (b^c)), the bitwise majority of a,b,c;
        # d += ymm5.
        vpxor   %ymm4,%ymm13,%ymm11
        vpaddd  %ymm5,%ymm15,%ymm15

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        # ymm11 += ymm5 + rotates(a): this is the next round's a.
        vpaddd  %ymm5,%ymm11,%ymm11
        vpaddd  %ymm7,%ymm11,%ymm11
        # Schedule round for slot 5 of the 16-slot word ring.  Slot N lives at
        # N*32-128(%rax) for N=0..7 and at N*32-256-128(%rbx) for N=8..15.
        # ymm6 arrives holding slot 5.  New slot 5 = slot 5 + slot 14
        # + s0(slot 6) + s1(slot 3), with s0(x) = x>>3 ^ ror(x,7) ^ ror(x,18)
        # and s1(x) = x>>10 ^ ror(x,17) ^ ror(x,19).
        # Round roles (SHA-256 variable names, matched from the rotate
        # amounts): a=ymm11 b=ymm12 d=ymm14 e=ymm15 f=ymm8 g=ymm9 h=ymm10.
        # Slot 6 goes to ymm5: s0 input now, starting word of the next round.
        vmovdqu 192-128(%rax),%ymm5
        vpaddd  448-256-128(%rbx),%ymm6,%ymm6

        # ymm7 = s0(slot 6), added into ymm6 below.
        vpsrld  $3,%ymm5,%ymm7
        vpsrld  $7,%ymm5,%ymm1
        vpslld  $25,%ymm5,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $18,%ymm5,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $14,%ymm5,%ymm2
        # ymm0 = slot 3 (already rewritten two rounds ago); ymm7 is rebuilt
        # as s1(slot 3) after the first add.
        vmovdqu 96-128(%rax),%ymm0
        vpsrld  $10,%ymm0,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $17,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $15,%ymm0,%ymm2
        vpaddd  %ymm7,%ymm6,%ymm6
        vpxor   %ymm1,%ymm4,%ymm7
        vpsrld  $19,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $13,%ymm0,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7
        vpaddd  %ymm7,%ymm6,%ymm6
        # Round proper.  ymm7 collects ror(e,6)^ror(e,11)^ror(e,25).
        vpsrld  $6,%ymm15,%ymm7
        vpslld  $26,%ymm15,%ymm2
        # Write the new word back to slot 5, then ymm6 = word + h, and below
        # + the 32-byte table entry at 32(%rbp).
        vmovdqu %ymm6,160-128(%rax)
        vpaddd  %ymm10,%ymm6,%ymm6

        vpsrld  $11,%ymm15,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm15,%ymm2
        vpaddd  32(%rbp),%ymm6,%ymm6
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm15,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm15,%ymm2
        # ymm0 = (~e & g) ^ (e & f); the XOR into ymm0 happens a few lines down.
        vpandn  %ymm9,%ymm15,%ymm0
        vpand   %ymm8,%ymm15,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7

        # h has been consumed; ymm10 now starts ror(a,2)^ror(a,13)^ror(a,22),
        # which is finished in ymm7.
        vpsrld  $2,%ymm11,%ymm10
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm11,%ymm1
        vpxor   %ymm4,%ymm0,%ymm0
        # ymm4 = a ^ b, left live for the following round.
        vpxor   %ymm11,%ymm12,%ymm4

        vpxor   %ymm1,%ymm10,%ymm10
        vpaddd  %ymm7,%ymm6,%ymm6

        vpsrld  $13,%ymm11,%ymm1

        vpslld  $19,%ymm11,%ymm2
        vpaddd  %ymm0,%ymm6,%ymm6
        # ymm3 still holds the previous round's a^b, i.e. this round's b^c.
        vpand   %ymm4,%ymm3,%ymm3

        vpxor   %ymm1,%ymm10,%ymm7

        vpsrld  $22,%ymm11,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm11,%ymm2
        # ymm10 = b ^ ((a^b) & (b^c)), the bitwise majority of a,b,c;
        # d += ymm6.
        vpxor   %ymm3,%ymm12,%ymm10
        vpaddd  %ymm6,%ymm14,%ymm14

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        # ymm10 += ymm6 + rotates(a): this is the next round's a.
        vpaddd  %ymm6,%ymm10,%ymm10
        vpaddd  %ymm7,%ymm10,%ymm10
        # Schedule round for slot 6 of the 16-slot word ring.  Slot N lives at
        # N*32-128(%rax) for N=0..7 and at N*32-256-128(%rbx) for N=8..15.
        # ymm5 arrives holding slot 6.  New slot 6 = slot 6 + slot 15
        # + s0(slot 7) + s1(slot 4), with s0(x) = x>>3 ^ ror(x,7) ^ ror(x,18)
        # and s1(x) = x>>10 ^ ror(x,17) ^ ror(x,19).
        # Round roles (SHA-256 variable names, matched from the rotate
        # amounts): a=ymm10 b=ymm11 d=ymm13 e=ymm14 f=ymm15 g=ymm8 h=ymm9.
        # Slot 7 goes to ymm6: s0 input now, starting word of the next round.
        vmovdqu 224-128(%rax),%ymm6
        vpaddd  480-256-128(%rbx),%ymm5,%ymm5

        # ymm7 = s0(slot 7), added into ymm5 below.
        vpsrld  $3,%ymm6,%ymm7
        vpsrld  $7,%ymm6,%ymm1
        vpslld  $25,%ymm6,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $18,%ymm6,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $14,%ymm6,%ymm2
        # ymm0 = slot 4 (already rewritten two rounds ago); ymm7 is rebuilt
        # as s1(slot 4) after the first add.
        vmovdqu 128-128(%rax),%ymm0
        vpsrld  $10,%ymm0,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $17,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $15,%ymm0,%ymm2
        vpaddd  %ymm7,%ymm5,%ymm5
        vpxor   %ymm1,%ymm3,%ymm7
        vpsrld  $19,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $13,%ymm0,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7
        vpaddd  %ymm7,%ymm5,%ymm5
        # Round proper.  ymm7 collects ror(e,6)^ror(e,11)^ror(e,25).
        vpsrld  $6,%ymm14,%ymm7
        vpslld  $26,%ymm14,%ymm2
        # Write the new word back to slot 6, then ymm5 = word + h, and below
        # + the 32-byte table entry at 64(%rbp).
        vmovdqu %ymm5,192-128(%rax)
        vpaddd  %ymm9,%ymm5,%ymm5

        vpsrld  $11,%ymm14,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm14,%ymm2
        vpaddd  64(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm14,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm14,%ymm2
        # ymm0 = (~e & g) ^ (e & f); the XOR into ymm0 happens a few lines down.
        vpandn  %ymm8,%ymm14,%ymm0
        vpand   %ymm15,%ymm14,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7

        # h has been consumed; ymm9 now starts ror(a,2)^ror(a,13)^ror(a,22),
        # which is finished in ymm7.
        vpsrld  $2,%ymm10,%ymm9
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm10,%ymm1
        vpxor   %ymm3,%ymm0,%ymm0
        # ymm3 = a ^ b, left live for the following round.
        vpxor   %ymm10,%ymm11,%ymm3

        vpxor   %ymm1,%ymm9,%ymm9
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm10,%ymm1

        vpslld  $19,%ymm10,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        # ymm4 still holds the previous round's a^b, i.e. this round's b^c.
        vpand   %ymm3,%ymm4,%ymm4

        vpxor   %ymm1,%ymm9,%ymm7

        vpsrld  $22,%ymm10,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm10,%ymm2
        # ymm9 = b ^ ((a^b) & (b^c)), the bitwise majority of a,b,c;
        # d += ymm5.
        vpxor   %ymm4,%ymm11,%ymm9
        vpaddd  %ymm5,%ymm13,%ymm13

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        # ymm9 += ymm5 + rotates(a): this is the next round's a.
        vpaddd  %ymm5,%ymm9,%ymm9
        vpaddd  %ymm7,%ymm9,%ymm9
        # Schedule round for slot 7 of the 16-slot word ring.  Slot N lives at
        # N*32-128(%rax) for N=0..7 and at N*32-256-128(%rbx) for N=8..15.
        # ymm6 arrives holding slot 7.  New slot 7 = slot 7 + slot 0
        # + s0(slot 8) + s1(slot 5), with s0(x) = x>>3 ^ ror(x,7) ^ ror(x,18)
        # and s1(x) = x>>10 ^ ror(x,17) ^ ror(x,19).
        # Round roles (SHA-256 variable names, matched from the rotate
        # amounts): a=ymm9 b=ymm10 d=ymm12 e=ymm13 f=ymm14 g=ymm15 h=ymm8.
        # Slot 8 goes to ymm5: s0 input now, starting word of the next round.
        vmovdqu 256-256-128(%rbx),%ymm5
        vpaddd  0-128(%rax),%ymm6,%ymm6

        # ymm7 = s0(slot 8), added into ymm6 below.
        vpsrld  $3,%ymm5,%ymm7
        vpsrld  $7,%ymm5,%ymm1
        vpslld  $25,%ymm5,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $18,%ymm5,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $14,%ymm5,%ymm2
        # ymm0 = slot 5 (already rewritten two rounds ago); ymm7 is rebuilt
        # as s1(slot 5) after the first add.
        vmovdqu 160-128(%rax),%ymm0
        vpsrld  $10,%ymm0,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $17,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $15,%ymm0,%ymm2
        vpaddd  %ymm7,%ymm6,%ymm6
        vpxor   %ymm1,%ymm4,%ymm7
        vpsrld  $19,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $13,%ymm0,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7
        vpaddd  %ymm7,%ymm6,%ymm6
        # Round proper.  ymm7 collects ror(e,6)^ror(e,11)^ror(e,25).
        vpsrld  $6,%ymm13,%ymm7
        vpslld  $26,%ymm13,%ymm2
        # Write the new word back to slot 7, then ymm6 = word + h, and below
        # + the 32-byte table entry at 96(%rbp).
        vmovdqu %ymm6,224-128(%rax)
        vpaddd  %ymm8,%ymm6,%ymm6

        vpsrld  $11,%ymm13,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm13,%ymm2
        vpaddd  96(%rbp),%ymm6,%ymm6
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm13,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm13,%ymm2
        # ymm0 = (~e & g) ^ (e & f); the XOR into ymm0 happens a few lines down.
        vpandn  %ymm15,%ymm13,%ymm0
        vpand   %ymm14,%ymm13,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7

        # h has been consumed; ymm8 now starts ror(a,2)^ror(a,13)^ror(a,22),
        # which is finished in ymm7.
        vpsrld  $2,%ymm9,%ymm8
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm9,%ymm1
        vpxor   %ymm4,%ymm0,%ymm0
        # ymm4 = a ^ b, left live for the following round.
        vpxor   %ymm9,%ymm10,%ymm4

        vpxor   %ymm1,%ymm8,%ymm8
        vpaddd  %ymm7,%ymm6,%ymm6

        vpsrld  $13,%ymm9,%ymm1

        vpslld  $19,%ymm9,%ymm2
        vpaddd  %ymm0,%ymm6,%ymm6
        # ymm3 still holds the previous round's a^b, i.e. this round's b^c.
        vpand   %ymm4,%ymm3,%ymm3

        vpxor   %ymm1,%ymm8,%ymm7

        vpsrld  $22,%ymm9,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm9,%ymm2
        # ymm8 = b ^ ((a^b) & (b^c)), the bitwise majority of a,b,c;
        # d += ymm6.
        vpxor   %ymm3,%ymm10,%ymm8
        vpaddd  %ymm6,%ymm12,%ymm12

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        # ymm8 += ymm6 + rotates(a): this is the next round's a.
        vpaddd  %ymm6,%ymm8,%ymm8
        vpaddd  %ymm7,%ymm8,%ymm8
        # Step the table pointer past the eight 32-byte entries just used
        # (displacements -128..96), so the next eight rounds reuse the same
        # displacements.
        addq    $256,%rbp
        # Schedule round for slot 8 of the 16-slot word ring.  Slot N lives at
        # N*32-128(%rax) for N=0..7 and at N*32-256-128(%rbx) for N=8..15.
        # ymm5 arrives holding slot 8.  New slot 8 = slot 8 + slot 1
        # + s0(slot 9) + s1(slot 6), with s0(x) = x>>3 ^ ror(x,7) ^ ror(x,18)
        # and s1(x) = x>>10 ^ ror(x,17) ^ ror(x,19).
        # Round roles (SHA-256 variable names, matched from the rotate
        # amounts): a=ymm8 b=ymm9 d=ymm11 e=ymm12 f=ymm13 g=ymm14 h=ymm15.
        # Slot 9 goes to ymm6: s0 input now, starting word of the next round.
        vmovdqu 288-256-128(%rbx),%ymm6
        vpaddd  32-128(%rax),%ymm5,%ymm5

        # ymm7 = s0(slot 9), added into ymm5 below.
        vpsrld  $3,%ymm6,%ymm7
        vpsrld  $7,%ymm6,%ymm1
        vpslld  $25,%ymm6,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $18,%ymm6,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $14,%ymm6,%ymm2
        # ymm0 = slot 6 (already rewritten two rounds ago); ymm7 is rebuilt
        # as s1(slot 6) after the first add.
        vmovdqu 192-128(%rax),%ymm0
        vpsrld  $10,%ymm0,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $17,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $15,%ymm0,%ymm2
        vpaddd  %ymm7,%ymm5,%ymm5
        vpxor   %ymm1,%ymm3,%ymm7
        vpsrld  $19,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $13,%ymm0,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7
        vpaddd  %ymm7,%ymm5,%ymm5
        # Round proper.  ymm7 collects ror(e,6)^ror(e,11)^ror(e,25).
        vpsrld  $6,%ymm12,%ymm7
        vpslld  $26,%ymm12,%ymm2
        # Write the new word back to slot 8, then ymm5 = word + h, and below
        # + the 32-byte table entry at -128(%rbp).
        vmovdqu %ymm5,256-256-128(%rbx)
        vpaddd  %ymm15,%ymm5,%ymm5

        vpsrld  $11,%ymm12,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm12,%ymm2
        vpaddd  -128(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm12,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm12,%ymm2
        # ymm0 = (~e & g) ^ (e & f); the XOR into ymm0 happens a few lines down.
        vpandn  %ymm14,%ymm12,%ymm0
        vpand   %ymm13,%ymm12,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7

        # h has been consumed; ymm15 now starts ror(a,2)^ror(a,13)^ror(a,22),
        # which is finished in ymm7.
        vpsrld  $2,%ymm8,%ymm15
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm8,%ymm1
        vpxor   %ymm3,%ymm0,%ymm0
        # ymm3 = a ^ b, left live for the following round.
        vpxor   %ymm8,%ymm9,%ymm3

        vpxor   %ymm1,%ymm15,%ymm15
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm8,%ymm1

        vpslld  $19,%ymm8,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        # ymm4 still holds the previous round's a^b, i.e. this round's b^c.
        vpand   %ymm3,%ymm4,%ymm4

        vpxor   %ymm1,%ymm15,%ymm7

        vpsrld  $22,%ymm8,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm8,%ymm2
        # ymm15 = b ^ ((a^b) & (b^c)), the bitwise majority of a,b,c;
        # d += ymm5.
        vpxor   %ymm4,%ymm9,%ymm15
        vpaddd  %ymm5,%ymm11,%ymm11

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        # ymm15 += ymm5 + rotates(a): this is the next round's a.
        vpaddd  %ymm5,%ymm15,%ymm15
        vpaddd  %ymm7,%ymm15,%ymm15
        vmovdqu 320-256-128(%rbx),%ymm5
        vpaddd  64-128(%rax),%ymm6,%ymm6

        vpsrld  $3,%ymm5,%ymm7
        vpsrld  $7,%ymm5,%ymm1
        vpslld  $25,%ymm5,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $18,%ymm5,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $14,%ymm5,%ymm2
        vmovdqu 224-128(%rax),%ymm0
        vpsrld  $10,%ymm0,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $17,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $15,%ymm0,%ymm2
        vpaddd  %ymm7,%ymm6,%ymm6
        vpxor   %ymm1,%ymm4,%ymm7
        vpsrld  $19,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $13,%ymm0,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7
        vpaddd  %ymm7,%ymm6,%ymm6
        vpsrld  $6,%ymm11,%ymm7
        vpslld  $26,%ymm11,%ymm2
        vmovdqu %ymm6,288-256-128(%rbx)
        vpaddd  %ymm14,%ymm6,%ymm6

        vpsrld  $11,%ymm11,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm11,%ymm2
        vpaddd  -96(%rbp),%ymm6,%ymm6
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm11,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm11,%ymm2
        vpandn  %ymm13,%ymm11,%ymm0
        vpand   %ymm12,%ymm11,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $2,%ymm15,%ymm14
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm15,%ymm1
        vpxor   %ymm4,%ymm0,%ymm0
        vpxor   %ymm15,%ymm8,%ymm4

        vpxor   %ymm1,%ymm14,%ymm14
        vpaddd  %ymm7,%ymm6,%ymm6

        vpsrld  $13,%ymm15,%ymm1

        vpslld  $19,%ymm15,%ymm2
        vpaddd  %ymm0,%ymm6,%ymm6
        vpand   %ymm4,%ymm3,%ymm3

        vpxor   %ymm1,%ymm14,%ymm7

        vpsrld  $22,%ymm15,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm15,%ymm2
        vpxor   %ymm3,%ymm8,%ymm14
        vpaddd  %ymm6,%ymm10,%ymm10

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        vpaddd  %ymm6,%ymm14,%ymm14
        vpaddd  %ymm7,%ymm14,%ymm14
        vmovdqu 352-256-128(%rbx),%ymm6
        vpaddd  96-128(%rax),%ymm5,%ymm5

        vpsrld  $3,%ymm6,%ymm7
        vpsrld  $7,%ymm6,%ymm1
        vpslld  $25,%ymm6,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $18,%ymm6,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $14,%ymm6,%ymm2
        vmovdqu 256-256-128(%rbx),%ymm0
        vpsrld  $10,%ymm0,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $17,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $15,%ymm0,%ymm2
        vpaddd  %ymm7,%ymm5,%ymm5
        vpxor   %ymm1,%ymm3,%ymm7
        vpsrld  $19,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $13,%ymm0,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7
        vpaddd  %ymm7,%ymm5,%ymm5
        vpsrld  $6,%ymm10,%ymm7
        vpslld  $26,%ymm10,%ymm2
        vmovdqu %ymm5,320-256-128(%rbx)
        vpaddd  %ymm13,%ymm5,%ymm5

        vpsrld  $11,%ymm10,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm10,%ymm2
        vpaddd  -64(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm10,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm10,%ymm2
        vpandn  %ymm12,%ymm10,%ymm0
        vpand   %ymm11,%ymm10,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $2,%ymm14,%ymm13
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm14,%ymm1
        vpxor   %ymm3,%ymm0,%ymm0
        vpxor   %ymm14,%ymm15,%ymm3

        vpxor   %ymm1,%ymm13,%ymm13
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm14,%ymm1

        vpslld  $19,%ymm14,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        vpand   %ymm3,%ymm4,%ymm4

        vpxor   %ymm1,%ymm13,%ymm7

        vpsrld  $22,%ymm14,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm14,%ymm2
        vpxor   %ymm4,%ymm15,%ymm13
        vpaddd  %ymm5,%ymm9,%ymm9

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        vpaddd  %ymm5,%ymm13,%ymm13
        vpaddd  %ymm7,%ymm13,%ymm13
        vmovdqu 384-256-128(%rbx),%ymm5
        vpaddd  128-128(%rax),%ymm6,%ymm6

        vpsrld  $3,%ymm5,%ymm7
        vpsrld  $7,%ymm5,%ymm1
        vpslld  $25,%ymm5,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $18,%ymm5,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $14,%ymm5,%ymm2
        vmovdqu 288-256-128(%rbx),%ymm0
        vpsrld  $10,%ymm0,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $17,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $15,%ymm0,%ymm2
        vpaddd  %ymm7,%ymm6,%ymm6
        vpxor   %ymm1,%ymm4,%ymm7
        vpsrld  $19,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $13,%ymm0,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7
        vpaddd  %ymm7,%ymm6,%ymm6
        vpsrld  $6,%ymm9,%ymm7
        vpslld  $26,%ymm9,%ymm2
        vmovdqu %ymm6,352-256-128(%rbx)
        vpaddd  %ymm12,%ymm6,%ymm6

        vpsrld  $11,%ymm9,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm9,%ymm2
        vpaddd  -32(%rbp),%ymm6,%ymm6
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm9,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm9,%ymm2
        vpandn  %ymm11,%ymm9,%ymm0
        vpand   %ymm10,%ymm9,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $2,%ymm13,%ymm12
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm13,%ymm1
        vpxor   %ymm4,%ymm0,%ymm0
        vpxor   %ymm13,%ymm14,%ymm4

        vpxor   %ymm1,%ymm12,%ymm12
        vpaddd  %ymm7,%ymm6,%ymm6

        vpsrld  $13,%ymm13,%ymm1

        vpslld  $19,%ymm13,%ymm2
        vpaddd  %ymm0,%ymm6,%ymm6
        vpand   %ymm4,%ymm3,%ymm3

        vpxor   %ymm1,%ymm12,%ymm7

        vpsrld  $22,%ymm13,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm13,%ymm2
        vpxor   %ymm3,%ymm14,%ymm12
        vpaddd  %ymm6,%ymm8,%ymm8

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        vpaddd  %ymm6,%ymm12,%ymm12
        vpaddd  %ymm7,%ymm12,%ymm12
        vmovdqu 416-256-128(%rbx),%ymm6
        vpaddd  160-128(%rax),%ymm5,%ymm5

        vpsrld  $3,%ymm6,%ymm7
        vpsrld  $7,%ymm6,%ymm1
        vpslld  $25,%ymm6,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $18,%ymm6,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $14,%ymm6,%ymm2
        vmovdqu 320-256-128(%rbx),%ymm0
        vpsrld  $10,%ymm0,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $17,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $15,%ymm0,%ymm2
        vpaddd  %ymm7,%ymm5,%ymm5
        vpxor   %ymm1,%ymm3,%ymm7
        vpsrld  $19,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $13,%ymm0,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7
        vpaddd  %ymm7,%ymm5,%ymm5
        vpsrld  $6,%ymm8,%ymm7
        vpslld  $26,%ymm8,%ymm2
        vmovdqu %ymm5,384-256-128(%rbx)
        vpaddd  %ymm11,%ymm5,%ymm5

        vpsrld  $11,%ymm8,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm8,%ymm2
        vpaddd  0(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm8,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm8,%ymm2
        vpandn  %ymm10,%ymm8,%ymm0
        vpand   %ymm9,%ymm8,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $2,%ymm12,%ymm11
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm12,%ymm1
        vpxor   %ymm3,%ymm0,%ymm0
        vpxor   %ymm12,%ymm13,%ymm3

        vpxor   %ymm1,%ymm11,%ymm11
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm12,%ymm1

        vpslld  $19,%ymm12,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        vpand   %ymm3,%ymm4,%ymm4

        vpxor   %ymm1,%ymm11,%ymm7

        vpsrld  $22,%ymm12,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm12,%ymm2
        vpxor   %ymm4,%ymm13,%ymm11
        vpaddd  %ymm5,%ymm15,%ymm15

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        vpaddd  %ymm5,%ymm11,%ymm11
        vpaddd  %ymm7,%ymm11,%ymm11
        vmovdqu 448-256-128(%rbx),%ymm5
        vpaddd  192-128(%rax),%ymm6,%ymm6

        vpsrld  $3,%ymm5,%ymm7
        vpsrld  $7,%ymm5,%ymm1
        vpslld  $25,%ymm5,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $18,%ymm5,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $14,%ymm5,%ymm2
        vmovdqu 352-256-128(%rbx),%ymm0
        vpsrld  $10,%ymm0,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $17,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $15,%ymm0,%ymm2
        vpaddd  %ymm7,%ymm6,%ymm6
        vpxor   %ymm1,%ymm4,%ymm7
        vpsrld  $19,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $13,%ymm0,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7
        vpaddd  %ymm7,%ymm6,%ymm6
        vpsrld  $6,%ymm15,%ymm7
        vpslld  $26,%ymm15,%ymm2
        vmovdqu %ymm6,416-256-128(%rbx)
        vpaddd  %ymm10,%ymm6,%ymm6

        vpsrld  $11,%ymm15,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm15,%ymm2
        vpaddd  32(%rbp),%ymm6,%ymm6
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm15,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm15,%ymm2
        vpandn  %ymm9,%ymm15,%ymm0
        vpand   %ymm8,%ymm15,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $2,%ymm11,%ymm10
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm11,%ymm1
        vpxor   %ymm4,%ymm0,%ymm0
        vpxor   %ymm11,%ymm12,%ymm4

        vpxor   %ymm1,%ymm10,%ymm10
        vpaddd  %ymm7,%ymm6,%ymm6

        vpsrld  $13,%ymm11,%ymm1

        vpslld  $19,%ymm11,%ymm2
        vpaddd  %ymm0,%ymm6,%ymm6
        vpand   %ymm4,%ymm3,%ymm3

        vpxor   %ymm1,%ymm10,%ymm7

        vpsrld  $22,%ymm11,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm11,%ymm2
        vpxor   %ymm3,%ymm12,%ymm10
        vpaddd  %ymm6,%ymm14,%ymm14

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        vpaddd  %ymm6,%ymm10,%ymm10
        vpaddd  %ymm7,%ymm10,%ymm10
        vmovdqu 480-256-128(%rbx),%ymm6
        vpaddd  224-128(%rax),%ymm5,%ymm5

        vpsrld  $3,%ymm6,%ymm7
        vpsrld  $7,%ymm6,%ymm1
        vpslld  $25,%ymm6,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $18,%ymm6,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $14,%ymm6,%ymm2
        vmovdqu 384-256-128(%rbx),%ymm0
        vpsrld  $10,%ymm0,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $17,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $15,%ymm0,%ymm2
        vpaddd  %ymm7,%ymm5,%ymm5
        vpxor   %ymm1,%ymm3,%ymm7
        vpsrld  $19,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $13,%ymm0,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7
        vpaddd  %ymm7,%ymm5,%ymm5
        vpsrld  $6,%ymm14,%ymm7
        vpslld  $26,%ymm14,%ymm2
        vmovdqu %ymm5,448-256-128(%rbx)
        vpaddd  %ymm9,%ymm5,%ymm5

        vpsrld  $11,%ymm14,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm14,%ymm2
        vpaddd  64(%rbp),%ymm5,%ymm5
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm14,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm14,%ymm2
        vpandn  %ymm8,%ymm14,%ymm0
        vpand   %ymm15,%ymm14,%ymm3

        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $2,%ymm10,%ymm9
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm10,%ymm1
        vpxor   %ymm3,%ymm0,%ymm0
        vpxor   %ymm10,%ymm11,%ymm3

        vpxor   %ymm1,%ymm9,%ymm9
        vpaddd  %ymm7,%ymm5,%ymm5

        vpsrld  $13,%ymm10,%ymm1

        vpslld  $19,%ymm10,%ymm2
        vpaddd  %ymm0,%ymm5,%ymm5
        vpand   %ymm3,%ymm4,%ymm4

        vpxor   %ymm1,%ymm9,%ymm7

        vpsrld  $22,%ymm10,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm10,%ymm2
        vpxor   %ymm4,%ymm11,%ymm9
        vpaddd  %ymm5,%ymm13,%ymm13

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        vpaddd  %ymm5,%ymm9,%ymm9
        vpaddd  %ymm7,%ymm9,%ymm9
        vmovdqu 0-128(%rax),%ymm5
        vpaddd  256-256-128(%rbx),%ymm6,%ymm6

        vpsrld  $3,%ymm5,%ymm7
        vpsrld  $7,%ymm5,%ymm1
        vpslld  $25,%ymm5,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $18,%ymm5,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $14,%ymm5,%ymm2
        vmovdqu 416-256-128(%rbx),%ymm0
        vpsrld  $10,%ymm0,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7
        vpsrld  $17,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $15,%ymm0,%ymm2
        vpaddd  %ymm7,%ymm6,%ymm6
        vpxor   %ymm1,%ymm4,%ymm7
        vpsrld  $19,%ymm0,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $13,%ymm0,%ymm2
        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7
        vpaddd  %ymm7,%ymm6,%ymm6
        vpsrld  $6,%ymm13,%ymm7
        vpslld  $26,%ymm13,%ymm2
        vmovdqu %ymm6,480-256-128(%rbx)
        vpaddd  %ymm8,%ymm6,%ymm6

        vpsrld  $11,%ymm13,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7
        vpslld  $21,%ymm13,%ymm2
        vpaddd  96(%rbp),%ymm6,%ymm6
        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $25,%ymm13,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $7,%ymm13,%ymm2
        vpandn  %ymm15,%ymm13,%ymm0
        vpand   %ymm14,%ymm13,%ymm4

        vpxor   %ymm1,%ymm7,%ymm7

        vpsrld  $2,%ymm9,%ymm8
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $30,%ymm9,%ymm1
        vpxor   %ymm4,%ymm0,%ymm0
        vpxor   %ymm9,%ymm10,%ymm4

        vpxor   %ymm1,%ymm8,%ymm8
        vpaddd  %ymm7,%ymm6,%ymm6

        vpsrld  $13,%ymm9,%ymm1

        vpslld  $19,%ymm9,%ymm2
        vpaddd  %ymm0,%ymm6,%ymm6
        vpand   %ymm4,%ymm3,%ymm3

        vpxor   %ymm1,%ymm8,%ymm7

        vpsrld  $22,%ymm9,%ymm1
        vpxor   %ymm2,%ymm7,%ymm7

        vpslld  $10,%ymm9,%ymm2
        vpxor   %ymm3,%ymm10,%ymm8
        vpaddd  %ymm6,%ymm12,%ymm12

        vpxor   %ymm1,%ymm7,%ymm7
        vpxor   %ymm2,%ymm7,%ymm7

        vpaddd  %ymm6,%ymm8,%ymm8
        vpaddd  %ymm7,%ymm8,%ymm8
        addq    $256,%rbp
        decl    %ecx
        jnz     .Loop_16_xx_avx2

        movl    $1,%ecx
        leaq    512(%rsp),%rbx
        leaq    K256+128(%rip),%rbp
        cmpl    0(%rbx),%ecx
        cmovgeq %rbp,%r12
        cmpl    4(%rbx),%ecx
        cmovgeq %rbp,%r13
        cmpl    8(%rbx),%ecx
        cmovgeq %rbp,%r14
        cmpl    12(%rbx),%ecx
        cmovgeq %rbp,%r15
        cmpl    16(%rbx),%ecx
        cmovgeq %rbp,%r8
        cmpl    20(%rbx),%ecx
        cmovgeq %rbp,%r9
        cmpl    24(%rbx),%ecx
        cmovgeq %rbp,%r10
        cmpl    28(%rbx),%ecx
        cmovgeq %rbp,%r11
        vmovdqa (%rbx),%ymm7
        vpxor   %ymm0,%ymm0,%ymm0
        vmovdqa %ymm7,%ymm6
        vpcmpgtd        %ymm0,%ymm6,%ymm6
        vpaddd  %ymm6,%ymm7,%ymm7

        vmovdqu 0-128(%rdi),%ymm0
        vpand   %ymm6,%ymm8,%ymm8
        vmovdqu 32-128(%rdi),%ymm1
        vpand   %ymm6,%ymm9,%ymm9
        vmovdqu 64-128(%rdi),%ymm2
        vpand   %ymm6,%ymm10,%ymm10
        vmovdqu 96-128(%rdi),%ymm5
        vpand   %ymm6,%ymm11,%ymm11
        vpaddd  %ymm0,%ymm8,%ymm8
        vmovdqu 128-128(%rdi),%ymm0
        vpand   %ymm6,%ymm12,%ymm12
        vpaddd  %ymm1,%ymm9,%ymm9
        vmovdqu 160-128(%rdi),%ymm1
        vpand   %ymm6,%ymm13,%ymm13
        vpaddd  %ymm2,%ymm10,%ymm10
        vmovdqu 192-128(%rdi),%ymm2
        vpand   %ymm6,%ymm14,%ymm14
        vpaddd  %ymm5,%ymm11,%ymm11
        vmovdqu 224-128(%rdi),%ymm5
        vpand   %ymm6,%ymm15,%ymm15
        vpaddd  %ymm0,%ymm12,%ymm12
        vpaddd  %ymm1,%ymm13,%ymm13
        vmovdqu %ymm8,0-128(%rdi)
        vpaddd  %ymm2,%ymm14,%ymm14
        vmovdqu %ymm9,32-128(%rdi)
        vpaddd  %ymm5,%ymm15,%ymm15
        vmovdqu %ymm10,64-128(%rdi)
        vmovdqu %ymm11,96-128(%rdi)
        vmovdqu %ymm12,128-128(%rdi)
        vmovdqu %ymm13,160-128(%rdi)
        vmovdqu %ymm14,192-128(%rdi)
        vmovdqu %ymm15,224-128(%rdi)

        vmovdqu %ymm7,(%rbx)
        leaq    256+128(%rsp),%rbx
        vmovdqu .Lpbswap(%rip),%ymm6
        decl    %edx
        jnz     .Loop_avx2







.Ldone_avx2:
        movq    544(%rsp),%rax
.cfi_def_cfa    %rax,8
        vzeroupper
        movq    -48(%rax),%r15
.cfi_restore    %r15
        movq    -40(%rax),%r14
.cfi_restore    %r14
        movq    -32(%rax),%r13
.cfi_restore    %r13
        movq    -24(%rax),%r12
.cfi_restore    %r12
        movq    -16(%rax),%rbp
.cfi_restore    %rbp
        movq    -8(%rax),%rbx
.cfi_restore    %rbx
        leaq    (%rax),%rsp
.cfi_def_cfa_register   %rsp
.Lepilogue_avx2:
        .byte   0xf3,0xc3
.cfi_endproc
.size   sha256_multi_block_avx2,.-sha256_multi_block_avx2
.align  256
        # K256: the 64 SHA-256 round constants (FIPS 180-4, section 4.2.2),
        # listed in round order.  Each constant is stored as eight
        # identical 32-bit copies, so every row below is 32 bytes and the
        # whole table is 2048 bytes.  The AVX2 rounds add a full row with
        # one 256-bit vpaddd at an offset from %rbp (initialised to
        # K256+128) and advance %rbp by 256 bytes, i.e. eight rows, at a
        # time.
K256:
        # K[0..7]
.long   0x428a2f98,0x428a2f98,0x428a2f98,0x428a2f98,0x428a2f98,0x428a2f98,0x428a2f98,0x428a2f98
.long   0x71374491,0x71374491,0x71374491,0x71374491,0x71374491,0x71374491,0x71374491,0x71374491
.long   0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf
.long   0xe9b5dba5,0xe9b5dba5,0xe9b5dba5,0xe9b5dba5,0xe9b5dba5,0xe9b5dba5,0xe9b5dba5,0xe9b5dba5
.long   0x3956c25b,0x3956c25b,0x3956c25b,0x3956c25b,0x3956c25b,0x3956c25b,0x3956c25b,0x3956c25b
.long   0x59f111f1,0x59f111f1,0x59f111f1,0x59f111f1,0x59f111f1,0x59f111f1,0x59f111f1,0x59f111f1
.long   0x923f82a4,0x923f82a4,0x923f82a4,0x923f82a4,0x923f82a4,0x923f82a4,0x923f82a4,0x923f82a4
.long   0xab1c5ed5,0xab1c5ed5,0xab1c5ed5,0xab1c5ed5,0xab1c5ed5,0xab1c5ed5,0xab1c5ed5,0xab1c5ed5
        # K[8..15]
.long   0xd807aa98,0xd807aa98,0xd807aa98,0xd807aa98,0xd807aa98,0xd807aa98,0xd807aa98,0xd807aa98
.long   0x12835b01,0x12835b01,0x12835b01,0x12835b01,0x12835b01,0x12835b01,0x12835b01,0x12835b01
.long   0x243185be,0x243185be,0x243185be,0x243185be,0x243185be,0x243185be,0x243185be,0x243185be
.long   0x550c7dc3,0x550c7dc3,0x550c7dc3,0x550c7dc3,0x550c7dc3,0x550c7dc3,0x550c7dc3,0x550c7dc3
.long   0x72be5d74,0x72be5d74,0x72be5d74,0x72be5d74,0x72be5d74,0x72be5d74,0x72be5d74,0x72be5d74
.long   0x80deb1fe,0x80deb1fe,0x80deb1fe,0x80deb1fe,0x80deb1fe,0x80deb1fe,0x80deb1fe,0x80deb1fe
.long   0x9bdc06a7,0x9bdc06a7,0x9bdc06a7,0x9bdc06a7,0x9bdc06a7,0x9bdc06a7,0x9bdc06a7,0x9bdc06a7
.long   0xc19bf174,0xc19bf174,0xc19bf174,0xc19bf174,0xc19bf174,0xc19bf174,0xc19bf174,0xc19bf174
        # K[16..23]
.long   0xe49b69c1,0xe49b69c1,0xe49b69c1,0xe49b69c1,0xe49b69c1,0xe49b69c1,0xe49b69c1,0xe49b69c1
.long   0xefbe4786,0xefbe4786,0xefbe4786,0xefbe4786,0xefbe4786,0xefbe4786,0xefbe4786,0xefbe4786
.long   0x0fc19dc6,0x0fc19dc6,0x0fc19dc6,0x0fc19dc6,0x0fc19dc6,0x0fc19dc6,0x0fc19dc6,0x0fc19dc6
.long   0x240ca1cc,0x240ca1cc,0x240ca1cc,0x240ca1cc,0x240ca1cc,0x240ca1cc,0x240ca1cc,0x240ca1cc
.long   0x2de92c6f,0x2de92c6f,0x2de92c6f,0x2de92c6f,0x2de92c6f,0x2de92c6f,0x2de92c6f,0x2de92c6f
.long   0x4a7484aa,0x4a7484aa,0x4a7484aa,0x4a7484aa,0x4a7484aa,0x4a7484aa,0x4a7484aa,0x4a7484aa
.long   0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc
.long   0x76f988da,0x76f988da,0x76f988da,0x76f988da,0x76f988da,0x76f988da,0x76f988da,0x76f988da
        # K[24..31]
.long   0x983e5152,0x983e5152,0x983e5152,0x983e5152,0x983e5152,0x983e5152,0x983e5152,0x983e5152
.long   0xa831c66d,0xa831c66d,0xa831c66d,0xa831c66d,0xa831c66d,0xa831c66d,0xa831c66d,0xa831c66d
.long   0xb00327c8,0xb00327c8,0xb00327c8,0xb00327c8,0xb00327c8,0xb00327c8,0xb00327c8,0xb00327c8
.long   0xbf597fc7,0xbf597fc7,0xbf597fc7,0xbf597fc7,0xbf597fc7,0xbf597fc7,0xbf597fc7,0xbf597fc7
.long   0xc6e00bf3,0xc6e00bf3,0xc6e00bf3,0xc6e00bf3,0xc6e00bf3,0xc6e00bf3,0xc6e00bf3,0xc6e00bf3
.long   0xd5a79147,0xd5a79147,0xd5a79147,0xd5a79147,0xd5a79147,0xd5a79147,0xd5a79147,0xd5a79147
.long   0x06ca6351,0x06ca6351,0x06ca6351,0x06ca6351,0x06ca6351,0x06ca6351,0x06ca6351,0x06ca6351
.long   0x14292967,0x14292967,0x14292967,0x14292967,0x14292967,0x14292967,0x14292967,0x14292967
        # K[32..39]
.long   0x27b70a85,0x27b70a85,0x27b70a85,0x27b70a85,0x27b70a85,0x27b70a85,0x27b70a85,0x27b70a85
.long   0x2e1b2138,0x2e1b2138,0x2e1b2138,0x2e1b2138,0x2e1b2138,0x2e1b2138,0x2e1b2138,0x2e1b2138
.long   0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc
.long   0x53380d13,0x53380d13,0x53380d13,0x53380d13,0x53380d13,0x53380d13,0x53380d13,0x53380d13
.long   0x650a7354,0x650a7354,0x650a7354,0x650a7354,0x650a7354,0x650a7354,0x650a7354,0x650a7354
.long   0x766a0abb,0x766a0abb,0x766a0abb,0x766a0abb,0x766a0abb,0x766a0abb,0x766a0abb,0x766a0abb
.long   0x81c2c92e,0x81c2c92e,0x81c2c92e,0x81c2c92e,0x81c2c92e,0x81c2c92e,0x81c2c92e,0x81c2c92e
.long   0x92722c85,0x92722c85,0x92722c85,0x92722c85,0x92722c85,0x92722c85,0x92722c85,0x92722c85
        # K[40..47]
.long   0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1
.long   0xa81a664b,0xa81a664b,0xa81a664b,0xa81a664b,0xa81a664b,0xa81a664b,0xa81a664b,0xa81a664b
.long   0xc24b8b70,0xc24b8b70,0xc24b8b70,0xc24b8b70,0xc24b8b70,0xc24b8b70,0xc24b8b70,0xc24b8b70
.long   0xc76c51a3,0xc76c51a3,0xc76c51a3,0xc76c51a3,0xc76c51a3,0xc76c51a3,0xc76c51a3,0xc76c51a3
.long   0xd192e819,0xd192e819,0xd192e819,0xd192e819,0xd192e819,0xd192e819,0xd192e819,0xd192e819
.long   0xd6990624,0xd6990624,0xd6990624,0xd6990624,0xd6990624,0xd6990624,0xd6990624,0xd6990624
.long   0xf40e3585,0xf40e3585,0xf40e3585,0xf40e3585,0xf40e3585,0xf40e3585,0xf40e3585,0xf40e3585
.long   0x106aa070,0x106aa070,0x106aa070,0x106aa070,0x106aa070,0x106aa070,0x106aa070,0x106aa070
        # K[48..55]
.long   0x19a4c116,0x19a4c116,0x19a4c116,0x19a4c116,0x19a4c116,0x19a4c116,0x19a4c116,0x19a4c116
.long   0x1e376c08,0x1e376c08,0x1e376c08,0x1e376c08,0x1e376c08,0x1e376c08,0x1e376c08,0x1e376c08
.long   0x2748774c,0x2748774c,0x2748774c,0x2748774c,0x2748774c,0x2748774c,0x2748774c,0x2748774c
.long   0x34b0bcb5,0x34b0bcb5,0x34b0bcb5,0x34b0bcb5,0x34b0bcb5,0x34b0bcb5,0x34b0bcb5,0x34b0bcb5
.long   0x391c0cb3,0x391c0cb3,0x391c0cb3,0x391c0cb3,0x391c0cb3,0x391c0cb3,0x391c0cb3,0x391c0cb3
.long   0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a
.long   0x5b9cca4f,0x5b9cca4f,0x5b9cca4f,0x5b9cca4f,0x5b9cca4f,0x5b9cca4f,0x5b9cca4f,0x5b9cca4f
.long   0x682e6ff3,0x682e6ff3,0x682e6ff3,0x682e6ff3,0x682e6ff3,0x682e6ff3,0x682e6ff3,0x682e6ff3
        # K[56..63]
.long   0x748f82ee,0x748f82ee,0x748f82ee,0x748f82ee,0x748f82ee,0x748f82ee,0x748f82ee,0x748f82ee
.long   0x78a5636f,0x78a5636f,0x78a5636f,0x78a5636f,0x78a5636f,0x78a5636f,0x78a5636f,0x78a5636f
.long   0x84c87814,0x84c87814,0x84c87814,0x84c87814,0x84c87814,0x84c87814,0x84c87814,0x84c87814
.long   0x8cc70208,0x8cc70208,0x8cc70208,0x8cc70208,0x8cc70208,0x8cc70208,0x8cc70208,0x8cc70208
.long   0x90befffa,0x90befffa,0x90befffa,0x90befffa,0x90befffa,0x90befffa,0x90befffa,0x90befffa
.long   0xa4506ceb,0xa4506ceb,0xa4506ceb,0xa4506ceb,0xa4506ceb,0xa4506ceb,0xa4506ceb,0xa4506ceb
.long   0xbef9a3f7,0xbef9a3f7,0xbef9a3f7,0xbef9a3f7,0xbef9a3f7,0xbef9a3f7,0xbef9a3f7,0xbef9a3f7
.long   0xc67178f2,0xc67178f2,0xc67178f2,0xc67178f2,0xc67178f2,0xc67178f2,0xc67178f2,0xc67178f2
.Lpbswap:
        # 32-byte constant that the hashing loops load into %xmm6 / %ymm6.
        # Written out in memory (little-endian) byte order: within every
        # group of four bytes the indices run 3,2,1,0, and the same
        # 16-byte pattern is stored twice, once per 128-bit half.
        # NOTE(review): as a byte-shuffle control this would reverse the
        # bytes of each 32-bit word; the instruction that consumes it is
        # not in this part of the file, so confirm at the use site.
.byte   0x03,0x02,0x01,0x00, 0x07,0x06,0x05,0x04, 0x0b,0x0a,0x09,0x08, 0x0f,0x0e,0x0d,0x0c
.byte   0x03,0x02,0x01,0x00, 0x07,0x06,0x05,0x04, 0x0b,0x0a,0x09,0x08, 0x0f,0x0e,0x0d,0x0c
K256_shaext:
        # The 64 SHA-256 round constants (FIPS 180-4, section 4.2.2) in
        # round order, one 32-bit word each (256 bytes in total), eight
        # constants per line.  Unlike K256 above, the values are not
        # replicated across lanes.
        # NOTE(review): judging by the name this table serves the SHA
        # extension code path; that code is not in this part of the file.
.long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.byte   83,72,65,50,53,54,32                                                    # "SHA256 "
.byte   109,117,108,116,105,45,98,108,111,99,107,32                             # "multi-block "
.byte   116,114,97,110,115,102,111,114,109,32                                   # "transform "
.byte   102,111,114,32                                                          # "for "
.byte   120,56,54,95,54,52,44,32                                                # "x86_64, "
.byte   67,82,89,80,84,79,71,65,77,83,32                                        # "CRYPTOGAMS "
.byte   98,121,32                                                               # "by "
.byte   60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62  # "<appro@openssl.org>"
.byte   0                                                                       # terminating NUL
        .section ".note.gnu.property", "a"
        # GNU program-property note.  The record has a fixed layout, so
        # the sizes are written as literals:
        #   header (12 bytes) | owner name (4) | one property (8 + 4 + 4 pad)
        .p2align 3
        .long 4                 # namesz: "GNU" plus its terminating NUL
        .long 16                # descsz: property header, data and trailing pad
        .long 5                 # note type (NT_GNU_PROPERTY_TYPE_0)
        # Owner name "GNU", spelled with .byte because .asciz is not
        # supported on Solaris.
        .byte 0x47, 0x4e, 0x55, 0
        .p2align 3
        .long 0xc0000002        # property type (GNU_PROPERTY_X86_FEATURE_1_AND)
        .long 4                 # property data size in bytes
        .long 3                 # feature bits 0 and 1 set (IBT and SHSTK)
        .p2align 3              # pad the descriptor out to an 8-byte boundary
